[llvm] r369876 - [X86] Automatically generate stack folding tests. NFC
Amaury Sechet via llvm-commits
llvm-commits at lists.llvm.org
Sun Aug 25 13:48:14 PDT 2019
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll?rev=369876&r1=369875&r2=369876&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll Sun Aug 25 13:48:14 2019
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,16 +10,45 @@ target triple = "x86_64-unknown-unknown"
; relevant registers and check that the reload is correctly folded into the instruction.
define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
- ;CHECK-LABEL: stack_fold_valignd
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
- ;CHECK-LABEL: stack_fold_valignd_mask
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
%3 = bitcast i16 %mask to <16 x i1>
@@ -28,8 +58,22 @@ define <16 x i32> @stack_fold_valignd_ma
}
define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: stack_fold_valignd_maskz
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
%3 = bitcast i16 %mask to <16 x i1>
@@ -38,16 +82,45 @@ define <16 x i32> @stack_fold_valignd_ma
}
define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
- ;CHECK-LABEL: stack_fold_valignq
- ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7],zmm0[0]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
- ;CHECK-LABEL: stack_fold_valignq_mask
- ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7],zmm0[0]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%3 = bitcast i8 %mask to <8 x i1>
@@ -57,8 +130,22 @@ define <8 x i64> @stack_fold_valignq_mas
}
define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: stack_fold_valignq_maskz
- ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7],zmm0[0]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%3 = bitcast i8 %mask to <8 x i1>
@@ -67,8 +154,14 @@ define <8 x i64> @stack_fold_valignq_mas
}
define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pavgb
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
@@ -76,16 +169,31 @@ define <64 x i8> @stack_fold_pavgb(<64 x
declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>)
define <64 x i8> @stack_fold_pavgb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pavgb_commuted:
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_pavgb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pavgb_mask
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -96,8 +204,17 @@ define <64 x i8> @stack_fold_pavgb_mask(
}
define <64 x i8> @stack_fold_pavgb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pavgb_mask_commuted:
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -108,8 +225,15 @@ define <64 x i8> @stack_fold_pavgb_mask_
}
define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pavgb_maskz
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -118,8 +242,15 @@ define <64 x i8> @stack_fold_pavgb_maskz
}
define <64 x i8> @stack_fold_pavgb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -128,8 +259,14 @@ define <64 x i8> @stack_fold_pavgb_maskz
}
define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pavgw
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
@@ -137,16 +274,31 @@ define <32 x i16> @stack_fold_pavgw(<32
declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>)
define <32 x i16> @stack_fold_pavgw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pavgw_commuted:
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_pavgw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pavgw_mask
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -157,8 +309,17 @@ define <32 x i16> @stack_fold_pavgw_mask
}
define <32 x i16> @stack_fold_pavgw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pavgw_mask_commuted:
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -169,8 +330,15 @@ define <32 x i16> @stack_fold_pavgw_mask
}
define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pavgw_maskz
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -179,8 +347,15 @@ define <32 x i16> @stack_fold_pavgw_mask
}
define <32 x i16> @stack_fold_pavgw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -189,8 +364,16 @@ define <32 x i16> @stack_fold_pavgw_mask
}
define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_extracti32x4
- ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti32x4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <16 x i16> %a0 to <16 x i32>
%2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
@@ -199,8 +382,16 @@ define <4 x i32> @stack_fold_extracti32x
}
define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_extracti64x2
- ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti64x2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vextracti32x4 $3, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <8 x i32> %a0 to <8 x i64>
%2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
@@ -209,8 +400,15 @@ define <2 x i64> @stack_fold_extracti64x
}
define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_extracti32x8
- ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti32x8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <16 x i16> %a0 to <16 x i32>
%2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -219,8 +417,15 @@ define <8 x i32> @stack_fold_extracti32x
}
define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_extracti64x4
- ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti64x4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <8 x i32> %a0 to <8 x i64>
%2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -229,8 +434,17 @@ define <4 x i64> @stack_fold_extracti64x
}
define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_inserti32x8
- ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_inserti32x8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; add forces execution domain
@@ -239,8 +453,17 @@ define <16 x i32> @stack_fold_inserti32x
}
define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_inserti64x4
- ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_inserti64x4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; add forces execution domain
@@ -249,8 +472,14 @@ define <8 x i64> @stack_fold_inserti64x4
}
define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pabsb
- ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, zeroinitializer
%3 = sub <64 x i8> zeroinitializer, %a0
@@ -259,8 +488,21 @@ define <64 x i8> @stack_fold_pabsb(<64 x
}
define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pabsb_mask
- ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, zeroinitializer
%3 = sub <64 x i8> zeroinitializer, %a0
@@ -271,8 +513,15 @@ define <64 x i8> @stack_fold_pabsb_mask(
}
define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pabsb_maskz
- ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, zeroinitializer
%3 = sub <64 x i8> zeroinitializer, %a0
@@ -283,8 +532,14 @@ define <64 x i8> @stack_fold_pabsb_maskz
}
define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pabsd
- ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, zeroinitializer
%3 = sub <16 x i32> zeroinitializer, %a0
@@ -293,8 +548,21 @@ define <16 x i32> @stack_fold_pabsd(<16
}
define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pabsd_mask
- ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, zeroinitializer
%3 = sub <16 x i32> zeroinitializer, %a0
@@ -305,8 +573,15 @@ define <16 x i32> @stack_fold_pabsd_mask
}
define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pabsd_maskz
- ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, zeroinitializer
%3 = sub <16 x i32> zeroinitializer, %a0
@@ -317,8 +592,14 @@ define <16 x i32> @stack_fold_pabsd_mask
}
define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_pabsq
- ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, zeroinitializer
%3 = sub <8 x i64> zeroinitializer, %a0
@@ -327,8 +608,21 @@ define <8 x i64> @stack_fold_pabsq(<8 x
}
define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pabsq_mask
- ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, zeroinitializer
%3 = sub <8 x i64> zeroinitializer, %a0
@@ -339,8 +633,15 @@ define <8 x i64> @stack_fold_pabsq_mask(
}
define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pabsq_maskz
- ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, zeroinitializer
%3 = sub <8 x i64> zeroinitializer, %a0
@@ -351,8 +652,14 @@ define <8 x i64> @stack_fold_pabsq_maskz
}
define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pabsw
- ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, zeroinitializer
%3 = sub <32 x i16> zeroinitializer, %a0
@@ -361,8 +668,21 @@ define <32 x i16> @stack_fold_pabsw(<32
}
define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pabsw_mask
- ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, zeroinitializer
%3 = sub <32 x i16> zeroinitializer, %a0
@@ -373,8 +693,15 @@ define <32 x i16> @stack_fold_pabsw_mask
}
define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pabsw_maskz
- ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, zeroinitializer
%3 = sub <32 x i16> zeroinitializer, %a0
@@ -385,8 +712,14 @@ define <32 x i16> @stack_fold_pabsw_mask
}
define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packssdw
- ;CHECK: vpackssdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packssdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a0, <16 x i32> %a1)
ret <32 x i16> %2
@@ -394,8 +727,14 @@ define <32 x i16> @stack_fold_packssdw(<
declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packsswb
- ;CHECK: vpacksswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packsswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a0, <32 x i16> %a1)
ret <64 x i8> %2
@@ -403,8 +742,14 @@ define <64 x i8> @stack_fold_packsswb(<3
declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packusdw
- ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
ret <32 x i16> %2
@@ -412,8 +757,17 @@ define <32 x i16> @stack_fold_packusdw(<
declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_packusdw_mask
- ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x i16>, <32 x i16>* %passthru
%3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
@@ -423,8 +777,15 @@ define <32 x i16> @stack_fold_packusdw_m
}
define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_packusdw_maskz
- ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a0, <16 x i32> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -433,8 +794,14 @@ define <32 x i16> @stack_fold_packusdw_m
}
define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packuswb
- ;CHECK: vpackuswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_packuswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a0, <32 x i16> %a1)
ret <64 x i8> %2
@@ -442,24 +809,45 @@ define <64 x i8> @stack_fold_packuswb(<3
declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddb
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a0, %a1
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddb_commuted:
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a1, %a0
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_mask
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a0, %a1
%3 = bitcast i64 %mask to <64 x i1>
@@ -470,8 +858,17 @@ define <64 x i8> @stack_fold_paddb_mask(
}
define <64 x i8> @stack_fold_paddb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_mask_commuted:
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a1, %a0
%3 = bitcast i64 %mask to <64 x i1>
@@ -482,8 +879,15 @@ define <64 x i8> @stack_fold_paddb_mask_
}
define <64 x i8> @stack_fold_paddb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_maskz
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a0, %a1
%3 = bitcast i64 %mask to <64 x i1>
@@ -492,8 +896,15 @@ define <64 x i8> @stack_fold_paddb_maskz
}
define <64 x i8> @stack_fold_paddb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_maskz_commuted:
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <64 x i8> %a1, %a0
%3 = bitcast i64 %mask to <64 x i1>
@@ -502,24 +913,45 @@ define <64 x i8> @stack_fold_paddb_maskz
}
define <16 x i32> @stack_fold_paddd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_paddd
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_paddd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_paddd_commuted:
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a1, %a0
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_paddd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddd_mask
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -530,8 +962,17 @@ define <16 x i32> @stack_fold_paddd_mask
}
define <16 x i32> @stack_fold_paddd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddd_mask_commuted:
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -542,8 +983,15 @@ define <16 x i32> @stack_fold_paddd_mask
}
define <16 x i32> @stack_fold_paddd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddd_maskz
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -552,8 +1000,15 @@ define <16 x i32> @stack_fold_paddd_mask
}
define <16 x i32> @stack_fold_paddd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddd_maskz_commuted:
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -562,24 +1017,45 @@ define <16 x i32> @stack_fold_paddd_mask
}
define <8 x i64> @stack_fold_paddq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_paddq
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_paddq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_paddq_commuted:
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a1, %a0
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_paddq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_paddq_mask
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -590,8 +1066,17 @@ define <8 x i64> @stack_fold_paddq_mask(
}
define <8 x i64> @stack_fold_paddq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_paddq_mask_commuted:
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -602,8 +1087,15 @@ define <8 x i64> @stack_fold_paddq_mask_
}
define <8 x i64> @stack_fold_paddq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_paddq_maskz
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -612,8 +1104,15 @@ define <8 x i64> @stack_fold_paddq_maskz
}
define <8 x i64> @stack_fold_paddq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_paddq_maskz_commuted:
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -622,24 +1121,45 @@ define <8 x i64> @stack_fold_paddq_maskz
}
define <64 x i8> @stack_fold_paddsb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddsb
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddsb_commuted:
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddsb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddsb_mask
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -650,8 +1170,17 @@ define <64 x i8> @stack_fold_paddsb_mask
}
define <64 x i8> @stack_fold_paddsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddsb_mask_commuted:
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -662,8 +1191,15 @@ define <64 x i8> @stack_fold_paddsb_mask
}
define <64 x i8> @stack_fold_paddsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddsb_maskz
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -672,8 +1208,15 @@ define <64 x i8> @stack_fold_paddsb_mask
}
define <64 x i8> @stack_fold_paddsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -682,24 +1225,45 @@ define <64 x i8> @stack_fold_paddsb_mask
}
define <32 x i16> @stack_fold_paddsw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddsw
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddsw_commuted:
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddsw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddsw_mask
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -710,8 +1274,17 @@ define <32 x i16> @stack_fold_paddsw_mas
}
define <32 x i16> @stack_fold_paddsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddsw_mask_commuted:
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -722,8 +1295,15 @@ define <32 x i16> @stack_fold_paddsw_mas
}
define <32 x i16> @stack_fold_paddsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddsw_maskz
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -732,8 +1312,15 @@ define <32 x i16> @stack_fold_paddsw_mas
}
define <32 x i16> @stack_fold_paddsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -742,24 +1329,45 @@ define <32 x i16> @stack_fold_paddsw_mas
}
define <64 x i8> @stack_fold_paddusb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddusb
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddusb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddusb_commuted:
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_paddusb_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddusb_mask
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -770,8 +1378,17 @@ define <64 x i8> @stack_fold_paddusb_mas
}
define <64 x i8> @stack_fold_paddusb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %a2, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddusb_mask_commuted:
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -782,8 +1399,15 @@ define <64 x i8> @stack_fold_paddusb_mas
}
define <64 x i8> @stack_fold_paddusb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddusb_maskz
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -792,8 +1416,15 @@ define <64 x i8> @stack_fold_paddusb_mas
}
define <64 x i8> @stack_fold_paddusb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -802,24 +1433,45 @@ define <64 x i8> @stack_fold_paddusb_mas
}
define <32 x i16> @stack_fold_paddusw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddusw
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddusw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddusw_commuted:
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddusw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddusw_mask
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -830,8 +1482,17 @@ define <32 x i16> @stack_fold_paddusw_ma
}
define <32 x i16> @stack_fold_paddusw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddusw_mask_commuted:
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -842,8 +1503,15 @@ define <32 x i16> @stack_fold_paddusw_ma
}
define <32 x i16> @stack_fold_paddusw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddusw_maskz
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -852,8 +1520,15 @@ define <32 x i16> @stack_fold_paddusw_ma
}
define <32 x i16> @stack_fold_paddusw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -862,24 +1537,45 @@ define <32 x i16> @stack_fold_paddusw_ma
}
define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddw
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a0, %a1
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddw_commuted:
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a1, %a0
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_paddw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddw_mask
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -890,8 +1586,17 @@ define <32 x i16> @stack_fold_paddw_mask
}
define <32 x i16> @stack_fold_paddw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddw_mask_commuted:
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
@@ -902,8 +1607,15 @@ define <32 x i16> @stack_fold_paddw_mask
}
define <32 x i16> @stack_fold_paddw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddw_maskz
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -912,8 +1624,15 @@ define <32 x i16> @stack_fold_paddw_mask
}
define <32 x i16> @stack_fold_paddw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddw_maskz_commuted:
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i16> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
@@ -922,16 +1641,45 @@ define <32 x i16> @stack_fold_paddw_mask
}
define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_palignr
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) {
- ;CHECK-LABEL: stack_fold_palignr_mask
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
%3 = bitcast i64 %mask to <64 x i1>
@@ -941,8 +1689,22 @@ define <64 x i8> @stack_fold_palignr_mas
}
define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_palignr_maskz
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16],mem[33,34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32],mem[49,50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
%3 = bitcast i64 %mask to <64 x i1>
@@ -951,24 +1713,45 @@ define <64 x i8> @stack_fold_palignr_mas
}
define <16 x i32> @stack_fold_pandd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pandd
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pandd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pandd_commuted:
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a1, %a0
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pandd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pandd_mask
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -979,8 +1762,17 @@ define <16 x i32> @stack_fold_pandd_mask
}
define <16 x i32> @stack_fold_pandd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pandd_mask_commuted:
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -991,8 +1783,15 @@ define <16 x i32> @stack_fold_pandd_mask
}
define <16 x i32> @stack_fold_pandd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pandd_maskz
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -1001,8 +1800,15 @@ define <16 x i32> @stack_fold_pandd_mask
}
define <16 x i32> @stack_fold_pandd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pandd_maskz_commuted:
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandd_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -1011,24 +1817,45 @@ define <16 x i32> @stack_fold_pandd_mask
}
define <8 x i64> @stack_fold_pandq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pandq
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pandq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pandq_commuted:
- ;CHECK: vandps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a1, %a0
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pandq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pandq_mask
- ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -1039,8 +1866,17 @@ define <8 x i64> @stack_fold_pandq_mask(
}
define <8 x i64> @stack_fold_pandq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pandq_mask_commuted:
- ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -1051,8 +1887,15 @@ define <8 x i64> @stack_fold_pandq_mask_
}
define <8 x i64> @stack_fold_pandq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pandq_maskz
- ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -1061,8 +1904,15 @@ define <8 x i64> @stack_fold_pandq_maskz
}
define <8 x i64> @stack_fold_pandq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pandq_maskz_commuted:
- ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vandpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -1071,8 +1921,14 @@ define <8 x i64> @stack_fold_pandq_maskz
}
define <16 x i32> @stack_fold_vpconflictd(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictd
- ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a0)
ret <16 x i32> %2
@@ -1080,8 +1936,14 @@ define <16 x i32> @stack_fold_vpconflict
declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
define <8 x i64> @stack_fold_vpconflictq(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictq
- ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.conflict.q.512(<8 x i64> %a0)
ret <8 x i64> %2
@@ -1089,8 +1951,16 @@ define <8 x i64> @stack_fold_vpconflictq
declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqb
- ;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <64 x i8> %a0, %a1
%3 = bitcast <64 x i1> %2 to i64
@@ -1098,8 +1968,17 @@ define i64 @stack_fold_pcmpeqb(<64 x i8>
}
define i16 @stack_fold_pcmpeqd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqd
- ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <16 x i32> %a0, %a1
%3 = bitcast <16 x i1> %2 to i16
@@ -1107,8 +1986,17 @@ define i16 @stack_fold_pcmpeqd(<16 x i32
}
define i8 @stack_fold_pcmpeqq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqq
- ;CHECK: vpcmpeqq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <8 x i64> %a0, %a1
%3 = bitcast <8 x i1> %2 to i8
@@ -1116,8 +2004,16 @@ define i8 @stack_fold_pcmpeqq(<8 x i64>
}
define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqw
- ;CHECK: vpcmpeqw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <32 x i16> %a0, %a1
%3 = bitcast <32 x i1> %2 to i32
@@ -1125,8 +2021,26 @@ define i32 @stack_fold_pcmpeqw(<32 x i16
}
define <16 x i32> @stack_fold_pcmpeqd_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
- ;CHECK-LABEL: stack_fold_pcmpeqd_mask
- ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 192
+; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
%2 = load <16 x i32>, <16 x i32>* %a2
@@ -1139,8 +2053,26 @@ define <16 x i32> @stack_fold_pcmpeqd_ma
}
define <16 x i32> @stack_fold_pcmpeqd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
- ;CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted
- ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 192
+; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
%2 = load <16 x i32>, <16 x i32>* %a2
@@ -1153,8 +2085,26 @@ define <16 x i32> @stack_fold_pcmpeqd_ma
}
define <16 x i32> @stack_fold_pcmpled_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask, <16 x i32> %b0, <16 x i32> %b1) {
- ;CHECK-LABEL: stack_fold_pcmpled_mask
- ;CHECK: vpcmpled {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpled_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 192
+; CHECK-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm2, (%rsp) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcmpled {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpblendmd (%rsp), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: addq $184, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
; load and add are here to keep the operations below the side effecting block and to avoid folding the wrong load
%2 = load <16 x i32>, <16 x i32>* %a2
@@ -1167,8 +2117,25 @@ define <16 x i32> @stack_fold_pcmpled_ma
}
define i16 @stack_fold_pcmpleud(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pcmpleud
- ;CHECK: vpcmpleud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpleud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: vpcmpleud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 # 64-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: andl %esi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i32>, <16 x i32>* %a2
%3 = add <16 x i32> %a1, %2
@@ -1180,8 +2147,14 @@ define i16 @stack_fold_pcmpleud(<16 x i3
}
define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_permbvar
- ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permbvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
ret <64 x i8> %2
@@ -1189,8 +2162,17 @@ define <64 x i8> @stack_fold_permbvar(<6
declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) nounwind readonly
define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_permbvar_mask
- ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permbvar_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -1201,8 +2183,15 @@ define <64 x i8> @stack_fold_permbvar_ma
}
define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_permbvar_maskz
- ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permbvar_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0)
%3 = bitcast i64 %mask to <64 x i1>
@@ -1211,8 +2200,16 @@ define <64 x i8> @stack_fold_permbvar_ma
}
define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_permd
- ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0)
; add forces execution domain
@@ -1222,40 +2219,73 @@ define <16 x i32> @stack_fold_permd(<16
declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) nounwind readonly
define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2b
- ;CHECK: vpermi2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2)
ret <64 x i8> %2
}
define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2d
- ;CHECK: vpermi2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2q
- ;CHECK: vpermi2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2q:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
ret <8 x i64> %2
}
define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2w
- ;CHECK: vpermi2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
ret <32 x i16> %2
}
define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_permq
- ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
; add forces execution domain
@@ -1264,8 +2294,19 @@ define <8 x i64> @stack_fold_permq(<8 x
}
define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_permq_mask
- ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,2,3,7,6,6,7]
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1278,8 +2319,16 @@ define <8 x i64> @stack_fold_permq_mask(
}
define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_permq_maskz
- ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,2,3,7,6,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1288,8 +2337,16 @@ define <8 x i64> @stack_fold_permq_maskz
}
define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_permqvar
- ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permqvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
; add forces execution domain
@@ -1299,8 +2356,18 @@ define <8 x i64> @stack_fold_permqvar(<8
declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) nounwind readonly
define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_permqvar_mask
- ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permqvar_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vpsubq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0)
%3 = bitcast i8 %mask to <8 x i1>
@@ -1313,8 +2380,14 @@ define <8 x i64> @stack_fold_permqvar_ma
}
define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2b
- ;CHECK: vpermt2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
ret <64 x i8> %2
@@ -1322,8 +2395,14 @@ define <64 x i8> @stack_fold_vpermt2b(<6
declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>)
define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2d
- ;CHECK: vpermt2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
ret <16 x i32> %2
@@ -1331,8 +2410,14 @@ define <16 x i32> @stack_fold_vpermt2d(<
declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2q
- ;CHECK: vpermt2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2q:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
ret <8 x i64> %2
@@ -1340,8 +2425,14 @@ define <8 x i64> @stack_fold_vpermt2q(<8
declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2w
- ;CHECK: vpermt2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
ret <32 x i16> %2
@@ -1349,8 +2440,14 @@ define <32 x i16> @stack_fold_vpermt2w(<
declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_permwvar
- ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permwvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
ret <32 x i16> %2
@@ -1358,8 +2455,17 @@ define <32 x i16> @stack_fold_permwvar(<
declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) nounwind readonly
define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_permwvar_mask
- ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permwvar_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -1370,8 +2476,15 @@ define <32 x i16> @stack_fold_permwvar_m
}
define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_permwvar_maskz
- ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_permwvar_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i32 %mask to <32 x i1>
@@ -1380,9 +2493,45 @@ define <32 x i16> @stack_fold_permwvar_m
}
define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pextrd
- ;CHECK: vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
- ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+; CHECK-LABEL: stack_fold_pextrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
; add forces execution domain
%1 = add <4 x i32> %a0, %a1
%2 = extractelement <4 x i32> %1, i32 1
@@ -1391,65 +2540,262 @@ define i32 @stack_fold_pextrd(<4 x i32>
}
define i64 @stack_fold_pextrq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_pextrq
- ;CHECK: vpextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
- ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+; CHECK-LABEL: stack_fold_pextrq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: vpextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = extractelement <2 x i64> %a0, i32 1
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
ret i64 %1
}
define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrb
- ;CHECK: vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrd
- ;CHECK: vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
ret <4 x i32> %2
}
define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrq
- ;CHECK: vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
ret <2 x i64> %2
}
define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrw
- ;CHECK: vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
ret <8 x i16> %2
}
define <16 x i32> @stack_fold_vplzcntd(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntd
- ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a0, i1 false)
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_vplzcntq(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntq
- ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a0, i1 false)
ret <8 x i64> %2
}
define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_zmm
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
ret <32 x i16> %2
@@ -1457,8 +2803,17 @@ define <32 x i16> @stack_fold_pmaddubsw_
declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>) nounwind readnone
define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -1469,8 +2824,15 @@ define <32 x i16> @stack_fold_pmaddubsw_
}
define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -1479,8 +2841,14 @@ define <32 x i16> @stack_fold_pmaddubsw_
}
define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
ret <16 x i32> %2
@@ -1488,16 +2856,31 @@ define <16 x i32> @stack_fold_pmaddwd_zm
declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>) nounwind readnone
define <16 x i32> @stack_fold_pmaddwd_zmm_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -1508,8 +2891,17 @@ define <16 x i32> @stack_fold_pmaddwd_zm
}
define <16 x i32> @stack_fold_pmaddwd_zmm_mask_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i16 %mask to <16 x i1>
@@ -1520,8 +2912,15 @@ define <16 x i32> @stack_fold_pmaddwd_zm
}
define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -1530,8 +2929,15 @@ define <16 x i32> @stack_fold_pmaddwd_zm
}
define <16 x i32> @stack_fold_pmaddwd_zmm_maskz_commuted(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %a1, <32 x i16> %a0)
%3 = bitcast i16 %mask to <16 x i1>
@@ -1540,8 +2946,14 @@ define <16 x i32> @stack_fold_pmaddwd_zm
}
define <64 x i8> @stack_fold_pmaxsb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsb
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1549,8 +2961,14 @@ define <64 x i8> @stack_fold_pmaxsb(<64
}
define <64 x i8> @stack_fold_pmaxsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsb_commuted:
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1558,8 +2976,17 @@ define <64 x i8> @stack_fold_pmaxsb_comm
}
define <64 x i8> @stack_fold_pmaxsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsb_mask:
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1571,8 +2998,17 @@ define <64 x i8> @stack_fold_pmaxsb_mask
}
define <64 x i8> @stack_fold_pmaxsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1584,8 +3020,15 @@ define <64 x i8> @stack_fold_pmaxsb_mask
}
define <64 x i8> @stack_fold_pmaxsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsb_maskz:
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1595,8 +3038,15 @@ define <64 x i8> @stack_fold_pmaxsb_mask
}
define <64 x i8> @stack_fold_pmaxsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1606,8 +3056,14 @@ define <64 x i8> @stack_fold_pmaxsb_mask
}
define <16 x i32> @stack_fold_pmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsd
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1615,8 +3071,14 @@ define <16 x i32> @stack_fold_pmaxsd(<16
}
define <16 x i32> @stack_fold_pmaxsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsd_commuted:
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1624,8 +3086,17 @@ define <16 x i32> @stack_fold_pmaxsd_com
}
define <16 x i32> @stack_fold_pmaxsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsd_mask:
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1637,8 +3108,17 @@ define <16 x i32> @stack_fold_pmaxsd_mas
}
define <16 x i32> @stack_fold_pmaxsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1650,8 +3130,15 @@ define <16 x i32> @stack_fold_pmaxsd_mas
}
define <16 x i32> @stack_fold_pmaxsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsd_maskz:
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1661,8 +3148,15 @@ define <16 x i32> @stack_fold_pmaxsd_mas
}
define <16 x i32> @stack_fold_pmaxsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1672,8 +3166,14 @@ define <16 x i32> @stack_fold_pmaxsd_mas
}
define <8 x i64> @stack_fold_pmaxsq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsq
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1681,8 +3181,14 @@ define <8 x i64> @stack_fold_pmaxsq(<8 x
}
define <8 x i64> @stack_fold_pmaxsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsq_commuted:
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -1690,8 +3196,17 @@ define <8 x i64> @stack_fold_pmaxsq_comm
}
define <8 x i64> @stack_fold_pmaxsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsq_mask:
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1703,8 +3218,17 @@ define <8 x i64> @stack_fold_pmaxsq_mask
}
define <8 x i64> @stack_fold_pmaxsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -1716,8 +3240,15 @@ define <8 x i64> @stack_fold_pmaxsq_mask
}
define <8 x i64> @stack_fold_pmaxsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsq_maskz:
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1727,8 +3258,15 @@ define <8 x i64> @stack_fold_pmaxsq_mask
}
define <8 x i64> @stack_fold_pmaxsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -1738,8 +3276,14 @@ define <8 x i64> @stack_fold_pmaxsq_mask
}
define <32 x i16> @stack_fold_pmaxsw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsw
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -1747,8 +3291,14 @@ define <32 x i16> @stack_fold_pmaxsw(<32
}
define <32 x i16> @stack_fold_pmaxsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsw_commuted:
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -1756,8 +3306,17 @@ define <32 x i16> @stack_fold_pmaxsw_com
}
define <32 x i16> @stack_fold_pmaxsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsw_mask:
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -1769,8 +3328,17 @@ define <32 x i16> @stack_fold_pmaxsw_mas
}
define <32 x i16> @stack_fold_pmaxsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -1782,8 +3350,15 @@ define <32 x i16> @stack_fold_pmaxsw_mas
}
define <32 x i16> @stack_fold_pmaxsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsw_maskz:
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -1793,8 +3368,15 @@ define <32 x i16> @stack_fold_pmaxsw_mas
}
define <32 x i16> @stack_fold_pmaxsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -1804,8 +3386,14 @@ define <32 x i16> @stack_fold_pmaxsw_mas
}
define <64 x i8> @stack_fold_pmaxub(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxub
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1813,8 +3401,14 @@ define <64 x i8> @stack_fold_pmaxub(<64
}
define <64 x i8> @stack_fold_pmaxub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxub_commuted:
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1822,8 +3416,17 @@ define <64 x i8> @stack_fold_pmaxub_comm
}
define <64 x i8> @stack_fold_pmaxub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxub_mask:
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1835,8 +3438,17 @@ define <64 x i8> @stack_fold_pmaxub_mask
}
define <64 x i8> @stack_fold_pmaxub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1848,8 +3460,15 @@ define <64 x i8> @stack_fold_pmaxub_mask
}
define <64 x i8> @stack_fold_pmaxub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxub_maskz:
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -1859,8 +3478,15 @@ define <64 x i8> @stack_fold_pmaxub_mask
}
define <64 x i8> @stack_fold_pmaxub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -1870,8 +3496,14 @@ define <64 x i8> @stack_fold_pmaxub_mask
}
define <16 x i32> @stack_fold_pmaxud(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxud
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1879,8 +3511,14 @@ define <16 x i32> @stack_fold_pmaxud(<16
}
define <16 x i32> @stack_fold_pmaxud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxud_commuted:
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1888,8 +3526,17 @@ define <16 x i32> @stack_fold_pmaxud_com
}
define <16 x i32> @stack_fold_pmaxud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxud_mask:
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1901,8 +3548,17 @@ define <16 x i32> @stack_fold_pmaxud_mas
}
define <16 x i32> @stack_fold_pmaxud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1914,8 +3570,15 @@ define <16 x i32> @stack_fold_pmaxud_mas
}
define <16 x i32> @stack_fold_pmaxud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxud_maskz:
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -1925,8 +3588,15 @@ define <16 x i32> @stack_fold_pmaxud_mas
}
define <16 x i32> @stack_fold_pmaxud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -1936,8 +3606,14 @@ define <16 x i32> @stack_fold_pmaxud_mas
}
define <8 x i64> @stack_fold_pmaxuq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuq
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1945,8 +3621,14 @@ define <8 x i64> @stack_fold_pmaxuq(<8 x
}
define <8 x i64> @stack_fold_pmaxuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuq_commuted:
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -1954,8 +3636,17 @@ define <8 x i64> @stack_fold_pmaxuq_comm
}
define <8 x i64> @stack_fold_pmaxuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxuq_mask:
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1967,8 +3658,17 @@ define <8 x i64> @stack_fold_pmaxuq_mask
}
define <8 x i64> @stack_fold_pmaxuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -1980,8 +3680,15 @@ define <8 x i64> @stack_fold_pmaxuq_mask
}
define <8 x i64> @stack_fold_pmaxuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_maskz:
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -1991,8 +3698,15 @@ define <8 x i64> @stack_fold_pmaxuq_mask
}
define <8 x i64> @stack_fold_pmaxuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2002,8 +3716,14 @@ define <8 x i64> @stack_fold_pmaxuq_mask
}
define <32 x i16> @stack_fold_pmaxuw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuw
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2011,8 +3731,14 @@ define <32 x i16> @stack_fold_pmaxuw(<32
}
define <32 x i16> @stack_fold_pmaxuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuw_commuted:
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2020,8 +3746,17 @@ define <32 x i16> @stack_fold_pmaxuw_com
}
define <32 x i16> @stack_fold_pmaxuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxuw_mask:
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2033,8 +3768,17 @@ define <32 x i16> @stack_fold_pmaxuw_mas
}
define <32 x i16> @stack_fold_pmaxuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2046,8 +3790,15 @@ define <32 x i16> @stack_fold_pmaxuw_mas
}
define <32 x i16> @stack_fold_pmaxuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuw_maskz:
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2057,8 +3808,15 @@ define <32 x i16> @stack_fold_pmaxuw_mas
}
define <32 x i16> @stack_fold_pmaxuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2068,8 +3826,14 @@ define <32 x i16> @stack_fold_pmaxuw_mas
}
define <64 x i8> @stack_fold_pminsb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminsb
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2077,8 +3841,14 @@ define <64 x i8> @stack_fold_pminsb(<64
}
define <64 x i8> @stack_fold_pminsb_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminsb_commuted:
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2086,8 +3856,17 @@ define <64 x i8> @stack_fold_pminsb_comm
}
define <64 x i8> @stack_fold_pminsb_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsb_mask:
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2099,8 +3878,17 @@ define <64 x i8> @stack_fold_pminsb_mask
}
define <64 x i8> @stack_fold_pminsb_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsb_mask_commuted:
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2112,8 +3900,15 @@ define <64 x i8> @stack_fold_pminsb_mask
}
define <64 x i8> @stack_fold_pminsb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pminsb_maskz:
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2123,8 +3918,15 @@ define <64 x i8> @stack_fold_pminsb_mask
}
define <64 x i8> @stack_fold_pminsb_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2134,8 +3936,14 @@ define <64 x i8> @stack_fold_pminsb_mask
}
define <16 x i32> @stack_fold_pminsd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminsd
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2143,8 +3951,14 @@ define <16 x i32> @stack_fold_pminsd(<16
}
define <16 x i32> @stack_fold_pminsd_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminsd_commuted:
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2152,8 +3966,17 @@ define <16 x i32> @stack_fold_pminsd_com
}
define <16 x i32> @stack_fold_pminsd_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsd_mask:
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2165,8 +3988,17 @@ define <16 x i32> @stack_fold_pminsd_mas
}
define <16 x i32> @stack_fold_pminsd_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsd_mask_commuted:
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2178,8 +4010,15 @@ define <16 x i32> @stack_fold_pminsd_mas
}
define <16 x i32> @stack_fold_pminsd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pminsd_maskz:
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2189,8 +4028,15 @@ define <16 x i32> @stack_fold_pminsd_mas
}
define <16 x i32> @stack_fold_pminsd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2200,8 +4046,14 @@ define <16 x i32> @stack_fold_pminsd_mas
}
define <8 x i64> @stack_fold_pminsq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminsq
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2209,8 +4061,14 @@ define <8 x i64> @stack_fold_pminsq(<8 x
}
define <8 x i64> @stack_fold_pminsq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminsq_commuted:
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2218,8 +4076,17 @@ define <8 x i64> @stack_fold_pminsq_comm
}
define <8 x i64> @stack_fold_pminsq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsq_mask:
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2231,8 +4098,17 @@ define <8 x i64> @stack_fold_pminsq_mask
}
define <8 x i64> @stack_fold_pminsq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsq_mask_commuted:
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2244,8 +4120,15 @@ define <8 x i64> @stack_fold_pminsq_mask
}
define <8 x i64> @stack_fold_pminsq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pminsq_maskz:
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2255,8 +4138,15 @@ define <8 x i64> @stack_fold_pminsq_mask
}
define <8 x i64> @stack_fold_pminsq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2266,8 +4156,14 @@ define <8 x i64> @stack_fold_pminsq_mask
}
define <32 x i16> @stack_fold_pminsw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminsw
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2275,8 +4171,14 @@ define <32 x i16> @stack_fold_pminsw(<32
}
define <32 x i16> @stack_fold_pminsw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminsw_commuted:
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2284,8 +4186,17 @@ define <32 x i16> @stack_fold_pminsw_com
}
define <32 x i16> @stack_fold_pminsw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsw_mask:
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2297,8 +4208,17 @@ define <32 x i16> @stack_fold_pminsw_mas
}
define <32 x i16> @stack_fold_pminsw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminsw_mask_commuted:
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2310,8 +4230,15 @@ define <32 x i16> @stack_fold_pminsw_mas
}
define <32 x i16> @stack_fold_pminsw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pminsw_maskz:
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2321,8 +4248,15 @@ define <32 x i16> @stack_fold_pminsw_mas
}
define <32 x i16> @stack_fold_pminsw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2332,8 +4266,14 @@ define <32 x i16> @stack_fold_pminsw_mas
}
define <64 x i8> @stack_fold_pminub(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminub
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2341,8 +4281,14 @@ define <64 x i8> @stack_fold_pminub(<64
}
define <64 x i8> @stack_fold_pminub_commuted(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminub_commuted:
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2350,8 +4296,17 @@ define <64 x i8> @stack_fold_pminub_comm
}
define <64 x i8> @stack_fold_pminub_mask(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminub_mask:
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2363,8 +4318,17 @@ define <64 x i8> @stack_fold_pminub_mask
}
define <64 x i8> @stack_fold_pminub_mask_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask, <64 x i8>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminub_mask_commuted:
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2376,8 +4340,15 @@ define <64 x i8> @stack_fold_pminub_mask
}
define <64 x i8> @stack_fold_pminub_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pminub_maskz:
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a0, %a1
%3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %a1
@@ -2387,8 +4358,15 @@ define <64 x i8> @stack_fold_pminub_mask
}
define <64 x i8> @stack_fold_pminub_maskz_commuted(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pminub_maskz_commuted:
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <64 x i8> %a1, %a0
%3 = select <64 x i1> %2, <64 x i8> %a1, <64 x i8> %a0
@@ -2398,8 +4376,14 @@ define <64 x i8> @stack_fold_pminub_mask
}
define <16 x i32> @stack_fold_pminud(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminud
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2407,8 +4391,14 @@ define <16 x i32> @stack_fold_pminud(<16
}
define <16 x i32> @stack_fold_pminud_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminud_commuted:
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2416,8 +4406,17 @@ define <16 x i32> @stack_fold_pminud_com
}
define <16 x i32> @stack_fold_pminud_mask(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminud_mask:
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2429,8 +4428,17 @@ define <16 x i32> @stack_fold_pminud_mas
}
define <16 x i32> @stack_fold_pminud_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminud_mask_commuted:
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2442,8 +4450,15 @@ define <16 x i32> @stack_fold_pminud_mas
}
define <16 x i32> @stack_fold_pminud_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pminud_maskz:
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a0, %a1
%3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %a1
@@ -2453,8 +4468,15 @@ define <16 x i32> @stack_fold_pminud_mas
}
define <16 x i32> @stack_fold_pminud_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pminud_maskz_commuted:
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i32> %a1, %a0
%3 = select <16 x i1> %2, <16 x i32> %a1, <16 x i32> %a0
@@ -2464,8 +4486,14 @@ define <16 x i32> @stack_fold_pminud_mas
}
define <8 x i64> @stack_fold_pminuq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminuq
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2473,8 +4501,14 @@ define <8 x i64> @stack_fold_pminuq(<8 x
}
define <8 x i64> @stack_fold_pminuq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminuq_commuted:
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2482,8 +4516,17 @@ define <8 x i64> @stack_fold_pminuq_comm
}
define <8 x i64> @stack_fold_pminuq_mask(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminuq_mask:
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2495,8 +4538,17 @@ define <8 x i64> @stack_fold_pminuq_mask
}
define <8 x i64> @stack_fold_pminuq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminuq_mask_commuted:
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2508,8 +4560,15 @@ define <8 x i64> @stack_fold_pminuq_mask
}
define <8 x i64> @stack_fold_pminuq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pminuq_maskz:
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a0, %a1
%3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %a1
@@ -2519,8 +4578,15 @@ define <8 x i64> @stack_fold_pminuq_mask
}
define <8 x i64> @stack_fold_pminuq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i64> %a1, %a0
%3 = select <8 x i1> %2, <8 x i64> %a1, <8 x i64> %a0
@@ -2530,8 +4596,14 @@ define <8 x i64> @stack_fold_pminuq_mask
}
define <32 x i16> @stack_fold_pminuw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminuw
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2539,8 +4611,14 @@ define <32 x i16> @stack_fold_pminuw(<32
}
define <32 x i16> @stack_fold_pminuw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminuw_commuted:
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2548,8 +4626,17 @@ define <32 x i16> @stack_fold_pminuw_com
}
define <32 x i16> @stack_fold_pminuw_mask(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminuw_mask:
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2561,8 +4648,17 @@ define <32 x i16> @stack_fold_pminuw_mas
}
define <32 x i16> @stack_fold_pminuw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask, <32 x i16>* %passthru) {
- ;CHECK-LABEL: stack_fold_pminuw_mask_commuted:
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm2
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2574,8 +4670,15 @@ define <32 x i16> @stack_fold_pminuw_mas
}
define <32 x i16> @stack_fold_pminuw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pminuw_maskz:
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a0, %a1
%3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %a1
@@ -2585,8 +4688,15 @@ define <32 x i16> @stack_fold_pminuw_mas
}
define <32 x i16> @stack_fold_pminuw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i16> %a1, %a0
%3 = select <32 x i1> %2, <32 x i16> %a1, <32 x i16> %a0
@@ -2596,8 +4706,15 @@ define <32 x i16> @stack_fold_pminuw_mas
}
define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovdb
- ;CHECK: vpmovdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovdb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -2605,8 +4722,14 @@ define <16 x i8> @stack_fold_vpmovdb(<16
declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovdw
- ;CHECK: vpmovdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i16> %1
@@ -2614,8 +4737,17 @@ define <16 x i16> @stack_fold_vpmovdw(<1
declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_movq_load
- ;CHECK: vmovq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_movq_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
; add forces execution domain
@@ -2624,8 +4756,14 @@ define <2 x i64> @stack_fold_movq_load(<
}
define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovqd
- ;CHECK: vpmovqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = trunc <8 x i64> %a0 to <8 x i32>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i32> %1
@@ -2633,8 +4771,15 @@ define <8 x i32> @stack_fold_vpmovqd(<8
declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovqw
- ;CHECK: vpmovqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -2642,8 +4787,14 @@ define <8 x i16> @stack_fold_vpmovqw(<8
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovwb
- ;CHECK: vpmovwb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovwb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovwb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = trunc <32 x i16> %a0 to <32 x i8>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <32 x i8> %1
@@ -2651,8 +4802,15 @@ define <32 x i8> @stack_fold_vpmovwb(<32
declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsdb
- ;CHECK: vpmovsdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsdb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -2660,8 +4818,14 @@ define <16 x i8> @stack_fold_vpmovsdb(<1
declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsdw
- ;CHECK: vpmovsdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i16> %1
@@ -2669,8 +4833,14 @@ define <16 x i16> @stack_fold_vpmovsdw(<
declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsqd
- ;CHECK: vpmovsqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i32> %1
@@ -2678,8 +4848,15 @@ define <8 x i32> @stack_fold_vpmovsqd(<8
declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsqw
- ;CHECK: vpmovsqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -2687,8 +4864,14 @@ define <8 x i16> @stack_fold_vpmovsqw(<8
declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovswb
- ;CHECK: vpmovswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <32 x i8> %1
@@ -2696,16 +4879,28 @@ define <32 x i8> @stack_fold_vpmovswb(<3
declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbd_zmm
- ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <16 x i8> %a0 to <16 x i32>
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_pmovsxbq_zmm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbq_zmm
- ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = sext <8 x i8> %2 to <8 x i64>
@@ -2713,40 +4908,71 @@ define <8 x i64> @stack_fold_pmovsxbq_zm
}
define <32 x i16> @stack_fold_pmovsxbw_zmm(<32 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbw_zmm
- ;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <32 x i8> %a0 to <32 x i16>
ret <32 x i16> %2
}
define <8 x i64> @stack_fold_pmovsxdq_zmm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxdq_zmm
- ;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxdq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <8 x i32> %a0 to <8 x i64>
ret <8 x i64> %2
}
define <16 x i32> @stack_fold_pmovsxwd_zmm(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwd_zmm
- ;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <16 x i16> %a0 to <16 x i32>
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_pmovsxwq_zmm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwq_zmm
- ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <8 x i16> %a0 to <8 x i64>
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pmovsxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm
- ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq_mask_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <8 x i16> %a0 to <8 x i64>
%3 = bitcast i8 %mask to <8 x i1>
@@ -2755,8 +4981,15 @@ define <8 x i64> @stack_fold_pmovsxwq_ma
}
define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm
- ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq_maskz_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <8 x i16> %a0 to <8 x i64>
%3 = bitcast i8 %mask to <8 x i1>
@@ -2765,8 +4998,15 @@ define <8 x i64> @stack_fold_pmovsxwq_ma
}
define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusdb
- ;CHECK: vpmovusdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusdb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusdb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -2774,8 +5014,14 @@ define <16 x i8> @stack_fold_vpmovusdb(<
declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusdw
- ;CHECK: vpmovusdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusdw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i16> %1
@@ -2783,8 +5029,14 @@ define <16 x i16> @stack_fold_vpmovusdw(
declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusqd
- ;CHECK: vpmovusqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusqd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i32> %1
@@ -2792,8 +5044,15 @@ define <8 x i32> @stack_fold_vpmovusqd(<
declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusqw
- ;CHECK: vpmovusqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusqw %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -2801,8 +5060,14 @@ define <8 x i16> @stack_fold_vpmovusqw(<
declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovuswb
- ;CHECK: vpmovuswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovuswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovuswb %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <32 x i8> %1
@@ -2810,16 +5075,30 @@ define <32 x i8> @stack_fold_vpmovuswb(<
declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbd_zmm
- ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <16 x i8> %a0 to <16 x i32>
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_pmovzxbq_zmm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbq_zmm
- ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = zext <8 x i8> %2 to <8 x i64>
@@ -2827,40 +5106,76 @@ define <8 x i64> @stack_fold_pmovzxbq_zm
}
define <32 x i16> @stack_fold_pmovzxbw_zmm(<32 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbw_zmm
- ;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <32 x i8> %a0 to <32 x i16>
ret <32 x i16> %2
}
define <8 x i64> @stack_fold_pmovzxdq_zmm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxdq_zmm
- ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxdq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i32> %a0 to <8 x i64>
ret <8 x i64> %2
}
define <16 x i32> @stack_fold_pmovzxwd_zmm(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwd_zmm
- ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 32-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <16 x i16> %a0 to <16 x i32>
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_pmovzxwq_zmm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_zmm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i64>
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pmovzxwq_mask_zmm(<8 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_mask_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i64>
%3 = bitcast i8 %mask to <8 x i1>
@@ -2869,8 +5184,16 @@ define <8 x i64> @stack_fold_pmovzxwq_ma
}
define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_maskz_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i64>
%3 = bitcast i8 %mask to <8 x i1>
@@ -2879,24 +5202,45 @@ define <8 x i64> @stack_fold_pmovzxwq_ma
}
define <16 x i32> @stack_fold_pmulld(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmulld
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pmulld_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmulld_commuted:
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a1, %a0
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pmulld_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmulld_mask
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -2907,8 +5251,17 @@ define <16 x i32> @stack_fold_pmulld_mas
}
define <16 x i32> @stack_fold_pmulld_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmulld_mask_commuted:
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -2919,8 +5272,15 @@ define <16 x i32> @stack_fold_pmulld_mas
}
define <16 x i32> @stack_fold_pmulld_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmulld_maskz
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -2929,8 +5289,15 @@ define <16 x i32> @stack_fold_pmulld_mas
}
define <16 x i32> @stack_fold_pmulld_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
- ;CHECK: vpmulld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -2939,24 +5306,45 @@ define <16 x i32> @stack_fold_pmulld_mas
}
define <8 x i64> @stack_fold_pmullq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmullq
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pmullq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmullq_commuted:
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a1, %a0
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pmullq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmullq_mask
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -2967,8 +5355,17 @@ define <8 x i64> @stack_fold_pmullq_mask
}
define <8 x i64> @stack_fold_pmullq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmullq_mask_commuted:
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -2979,8 +5376,15 @@ define <8 x i64> @stack_fold_pmullq_mask
}
define <8 x i64> @stack_fold_pmullq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmullq_maskz
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -2989,8 +5393,15 @@ define <8 x i64> @stack_fold_pmullq_mask
}
define <8 x i64> @stack_fold_pmullq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
- ;CHECK: vpmullq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmullq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -2999,24 +5410,45 @@ define <8 x i64> @stack_fold_pmullq_mask
}
define <32 x i16> @stack_fold_pmullw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmullw
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a0, %a1
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_pmullw_commuted(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmullw_commuted:
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a1, %a0
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_pmullw_mask(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmullw_mask
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -3027,8 +5459,17 @@ define <32 x i16> @stack_fold_pmullw_mas
}
define <32 x i16> @stack_fold_pmullw_mask_commuted(<32 x i16> %a0, <32 x i16> %a1, <32 x i16>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmullw_mask_commuted:
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
@@ -3039,8 +5480,15 @@ define <32 x i16> @stack_fold_pmullw_mas
}
define <32 x i16> @stack_fold_pmullw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmullw_maskz
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -3049,8 +5497,15 @@ define <32 x i16> @stack_fold_pmullw_mas
}
define <32 x i16> @stack_fold_pmullw_maskz_commuted(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
- ;CHECK: vpmullw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = mul <32 x i16> %a1, %a0
%3 = bitcast i32 %mask to <32 x i1>
@@ -3059,8 +5514,14 @@ define <32 x i16> @stack_fold_pmullw_mas
}
define <8 x i64> @stack_fold_pmuldq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmuldq
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3071,8 +5532,14 @@ define <8 x i64> @stack_fold_pmuldq(<8 x
}
define <8 x i64> @stack_fold_pmuldq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmuldq_commuted:
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3083,8 +5550,17 @@ define <8 x i64> @stack_fold_pmuldq_comm
}
define <8 x i64> @stack_fold_pmuldq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuldq_mask
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3099,8 +5575,17 @@ define <8 x i64> @stack_fold_pmuldq_mask
}
define <8 x i64> @stack_fold_pmuldq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3115,8 +5600,15 @@ define <8 x i64> @stack_fold_pmuldq_mask
}
define <8 x i64> @stack_fold_pmuldq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuldq_maskz
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3129,8 +5621,15 @@ define <8 x i64> @stack_fold_pmuldq_mask
}
define <8 x i64> @stack_fold_pmuldq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shl <8 x i64> %a0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
@@ -3146,8 +5645,14 @@ define <8 x i64> @stack_fold_pmuldq_mask
define <8 x i64> @stack_fold_pmuludq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmuludq
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3156,8 +5661,14 @@ define <8 x i64> @stack_fold_pmuludq(<8
}
define <8 x i64> @stack_fold_pmuludq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmuludq_commuted:
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3166,8 +5677,17 @@ define <8 x i64> @stack_fold_pmuludq_com
}
define <8 x i64> @stack_fold_pmuludq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_mask
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3180,8 +5700,17 @@ define <8 x i64> @stack_fold_pmuludq_mas
}
define <8 x i64> @stack_fold_pmuludq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3194,8 +5723,15 @@ define <8 x i64> @stack_fold_pmuludq_mas
}
define <8 x i64> @stack_fold_pmuludq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_maskz
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3206,8 +5742,15 @@ define <8 x i64> @stack_fold_pmuludq_mas
}
define <8 x i64> @stack_fold_pmuludq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = and <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%3 = and <8 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -3218,8 +5761,14 @@ define <8 x i64> @stack_fold_pmuludq_mas
}
define <16 x i32> @stack_fold_vpopcntd(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntd
- ;CHECK: vpopcntd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a0)
ret <16 x i32> %2
@@ -3227,8 +5776,14 @@ define <16 x i32> @stack_fold_vpopcntd(<
declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readonly
define <8 x i64> @stack_fold_vpopcntq(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntq
- ;CHECK: vpopcntq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a0)
ret <8 x i64> %2
@@ -3236,24 +5791,45 @@ define <8 x i64> @stack_fold_vpopcntq(<8
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
define <16 x i32> @stack_fold_pord(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pord
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pord_commuted:
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a1, %a0
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pord_mask
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -3264,8 +5840,17 @@ define <16 x i32> @stack_fold_pord_mask(
}
define <16 x i32> @stack_fold_pord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pord_mask_commuted:
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -3276,8 +5861,15 @@ define <16 x i32> @stack_fold_pord_mask_
}
define <16 x i32> @stack_fold_pord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pord_maskz
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -3286,8 +5878,15 @@ define <16 x i32> @stack_fold_pord_maskz
}
define <16 x i32> @stack_fold_pord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pord_maskz_commuted:
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pord_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -3296,24 +5895,45 @@ define <16 x i32> @stack_fold_pord_maskz
}
define <8 x i64> @stack_fold_porq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_porq
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_porq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_porq_commuted:
- ;CHECK: vorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a1, %a0
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_porq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_porq_mask
- ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -3324,8 +5944,17 @@ define <8 x i64> @stack_fold_porq_mask(<
}
define <8 x i64> @stack_fold_porq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_porq_mask_commuted:
- ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -3336,8 +5965,15 @@ define <8 x i64> @stack_fold_porq_mask_c
}
define <8 x i64> @stack_fold_porq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_porq_maskz
- ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -3346,8 +5982,15 @@ define <8 x i64> @stack_fold_porq_maskz(
}
define <8 x i64> @stack_fold_porq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_porq_maskz_commuted:
- ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_porq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = or <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -3356,8 +5999,14 @@ define <8 x i64> @stack_fold_porq_maskz_
}
define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psadbw
- ;CHECK: vpsadbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psadbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
ret <8 x i64> %2
@@ -3365,8 +6014,14 @@ define <8 x i64> @stack_fold_psadbw(<64
declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pshufb_zmm
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
@@ -3374,8 +6029,17 @@ define <64 x i8> @stack_fold_pshufb_zmm(
declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_zmm_mask
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <64 x i8>, <64 x i8>* %passthru
%3 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
@@ -3385,8 +6049,15 @@ define <64 x i8> @stack_fold_pshufb_zmm_
}
define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_zmm_maskz
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1)
%3 = bitcast i64 %mask to <64 x i1>
@@ -3395,8 +6066,17 @@ define <64 x i8> @stack_fold_pshufb_zmm_
}
define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pshufd_zmm
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
%3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -3404,8 +6084,22 @@ define <16 x i32> @stack_fold_pshufd_zmm
}
define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_zmm_mask
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
%3 = bitcast i16 %mask to <16 x i1>
@@ -3414,8 +6108,16 @@ define <16 x i32> @stack_fold_pshufd_zmm
}
define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_zmm_maskz
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
%3 = bitcast i16 %mask to <16 x i1>
@@ -3424,16 +6126,37 @@ define <16 x i32> @stack_fold_pshufd_zmm
}
define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshufhw_zmm
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_zmm_mask
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
%3 = bitcast i32 %mask to <32 x i1>
@@ -3442,8 +6165,16 @@ define <32 x i16> @stack_fold_pshufhw_zm
}
define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_zmm_maskz
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12,16,17,18,19,23,22,21,20,24,25,26,27,31,30,29,28]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
%3 = bitcast i32 %mask to <32 x i1>
@@ -3452,16 +6183,37 @@ define <32 x i16> @stack_fold_pshufhw_zm
}
define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshuflw_zmm
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_zmm_mask
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_zmm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
%3 = bitcast i32 %mask to <32 x i1>
@@ -3470,8 +6222,16 @@ define <32 x i16> @stack_fold_pshuflw_zm
}
define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_zmm_maskz
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_zmm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15,19,18,17,16,20,21,22,23,27,26,25,24,28,29,30,31]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
%3 = bitcast i32 %mask to <32 x i1>
@@ -3480,8 +6240,14 @@ define <32 x i16> @stack_fold_pshuflw_zm
}
define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pslld
- ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
ret <16 x i32> %2
@@ -3489,8 +6255,17 @@ define <16 x i32> @stack_fold_pslld(<16
declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pslld_mask
- ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3500,8 +6275,15 @@ define <16 x i32> @stack_fold_pslld_mask
}
define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pslld_maskz
- ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3510,8 +6292,14 @@ define <16 x i32> @stack_fold_pslld_mask
}
define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pslldi
- ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
ret <16 x i32> %2
@@ -3519,8 +6307,17 @@ define <16 x i32> @stack_fold_pslldi(<16
declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pslldi_mask
- ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldi_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3530,8 +6327,15 @@ define <16 x i32> @stack_fold_pslldi_mas
}
define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pslldi_maskz
- ;CHECK: vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldi_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpslld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3540,16 +6344,29 @@ define <16 x i32> @stack_fold_pslldi_mas
}
define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) {
- ;CHECK-LABEL: stack_fold_pslldq
- ;CHECK: vpslldq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslldq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = zero,mem[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,mem[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,mem[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,mem[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
ret <64 x i8> %2
}
define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllq
- ;CHECK: vpsllq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
ret <8 x i64> %2
@@ -3557,8 +6374,14 @@ define <8 x i64> @stack_fold_psllq(<8 x
declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_psllqi
- ;CHECK: vpsllq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllqi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
ret <8 x i64> %2
@@ -3566,8 +6389,14 @@ define <8 x i64> @stack_fold_psllqi(<8 x
declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psllvd
- ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %2
@@ -3575,8 +6404,17 @@ define <16 x i32> @stack_fold_psllvd(<16
declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_psllvd_mask
- ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3586,8 +6424,15 @@ define <16 x i32> @stack_fold_psllvd_mas
}
define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_psllvd_maskz
- ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -3596,8 +6441,14 @@ define <16 x i32> @stack_fold_psllvd_mas
}
define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllvq
- ;CHECK: vpsllvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %2
@@ -3605,8 +6456,14 @@ define <8 x i64> @stack_fold_psllvq(<8 x
declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllvw
- ;CHECK: vpsllvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
@@ -3614,8 +6471,14 @@ define <32 x i16> @stack_fold_psllvw(<32
declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllw
- ;CHECK: vpsllw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
ret <32 x i16> %2
@@ -3623,8 +6486,14 @@ define <32 x i16> @stack_fold_psllw(<32
declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_psllwi
- ;CHECK: vpsllw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllwi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
ret <32 x i16> %2
@@ -3632,8 +6501,14 @@ define <32 x i16> @stack_fold_psllwi(<32
declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrad
- ;CHECK: vpsrad {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrad:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
ret <16 x i32> %2
@@ -3641,8 +6516,14 @@ define <16 x i32> @stack_fold_psrad(<16
declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_psradi
- ;CHECK: vpsrad $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psradi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrad $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
ret <16 x i32> %2
@@ -3650,8 +6531,14 @@ define <16 x i32> @stack_fold_psradi(<16
declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psraq
- ;CHECK: vpsraq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
ret <8 x i64> %2
@@ -3659,8 +6546,14 @@ define <8 x i64> @stack_fold_psraq(<8 x
declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_psraqi
- ;CHECK: vpsraq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraqi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
ret <8 x i64> %2
@@ -3668,8 +6561,14 @@ define <8 x i64> @stack_fold_psraqi(<8 x
declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psravd
- ;CHECK: vpsravd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %2
@@ -3677,8 +6576,14 @@ define <16 x i32> @stack_fold_psravd(<16
declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psravq
- ;CHECK: vpsravq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %2
@@ -3686,8 +6591,14 @@ define <8 x i64> @stack_fold_psravq(<8 x
declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psravw
- ;CHECK: vpsravw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
@@ -3695,8 +6606,14 @@ define <32 x i16> @stack_fold_psravw(<32
declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psraw
- ;CHECK: vpsraw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
ret <32 x i16> %2
@@ -3704,8 +6621,14 @@ define <32 x i16> @stack_fold_psraw(<32
declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_psrawi
- ;CHECK: vpsraw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrawi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
ret <32 x i16> %2
@@ -3713,8 +6636,14 @@ define <32 x i16> @stack_fold_psrawi(<32
declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrld
- ;CHECK: vpsrld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
ret <16 x i32> %2
@@ -3722,8 +6651,14 @@ define <16 x i32> @stack_fold_psrld(<16
declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_psrldi
- ;CHECK: vpsrld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrldi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrld $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
ret <16 x i32> %2
@@ -3731,16 +6666,29 @@ define <16 x i32> @stack_fold_psrldi(<16
declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) {
- ;CHECK-LABEL: stack_fold_psrldq
- ;CHECK: vpsrldq $2, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrldq $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = mem[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,mem[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,mem[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,mem[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
ret <64 x i8> %2
}
define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlq
- ;CHECK: vpsrlq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
ret <8 x i64> %2
@@ -3748,8 +6696,14 @@ define <8 x i64> @stack_fold_psrlq(<8 x
declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_psrlqi
- ;CHECK: vpsrlq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlqi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlq $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
ret <8 x i64> %2
@@ -3757,8 +6711,14 @@ define <8 x i64> @stack_fold_psrlqi(<8 x
declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvd
- ;CHECK: vpsrlvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
ret <16 x i32> %2
@@ -3766,8 +6726,14 @@ define <16 x i32> @stack_fold_psrlvd(<16
declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvq
- ;CHECK: vpsrlvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
ret <8 x i64> %2
@@ -3775,8 +6741,14 @@ define <8 x i64> @stack_fold_psrlvq(<8 x
declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvw
- ;CHECK: vpsrlvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
@@ -3784,8 +6756,14 @@ define <32 x i16> @stack_fold_psrlvw(<32
declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlw
- ;CHECK: vpsrlw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
ret <32 x i16> %2
@@ -3793,8 +6771,14 @@ define <32 x i16> @stack_fold_psrlw(<32
declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_psrlwi
- ;CHECK: vpsrlw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlwi:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlw $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
ret <32 x i16> %2
@@ -3802,80 +6786,157 @@ define <32 x i16> @stack_fold_psrlwi(<32
declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubb
- ;CHECK: vpsubb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <64 x i8> %a0, %a1
ret <64 x i8> %2
}
define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psubd
- ;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psubq
- ;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubsb
- ;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
}
define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubsw
- ;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
}
define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubusb
- ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> %a0, <64 x i8> %a1)
ret <64 x i8> %2
}
define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubusw
- ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> %a0, <32 x i16> %a1)
ret <32 x i16> %2
}
define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubw
- ;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <32 x i16> %a0, %a1
ret <32 x i16> %2
}
define <8 x i64> @stack_fold_shufi64x2(<8 x i64> %a, <8 x i64> %b) {
- ;CHECK-LABEL: stack_fold_shufi64x2
- ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi64x2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_shufi64x2_mask(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_shufi64x2_mask
- ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi64x2_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,4,5],mem[2,3,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%3 = bitcast i8 %mask to <8 x i1>
@@ -3886,8 +6947,22 @@ define <8 x i64> @stack_fold_shufi64x2_m
}
define <8 x i64> @stack_fold_shufi64x2_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask, <8 x i64>* %passthru) {
- ;CHECK-LABEL: stack_fold_shufi64x2_maskz
- ;CHECK: vshufi64x2 $24, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi64x2_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vshufi64x2 $24, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,4,5],mem[2,3,0,1]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
%3 = bitcast i8 %mask to <8 x i1>
@@ -3896,8 +6971,24 @@ define <8 x i64> @stack_fold_shufi64x2_m
}
define <16 x i32> @stack_fold_shufi32x4_mask(<16 x i32> %a, <16 x i32> %b, i16 %mask, <16 x i32>* %passthru) {
- ;CHECK-LABEL: stack_fold_shufi32x4_mask
- ;CHECK: vshufi32x4 $20, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi32x4_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 (%rsi), %zmm1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
%3 = bitcast i16 %mask to <16 x i1>
@@ -3908,8 +6999,22 @@ define <16 x i32> @stack_fold_shufi32x4_
}
define <16 x i32> @stack_fold_shufi32x4_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
- ;CHECK-LABEL: stack_fold_shufi32x4_maskz
- ;CHECK: vshufi32x4 $20, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi32x4_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; CHECK-NEXT: vshufi32x4 $20, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],mem[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
%3 = bitcast i16 %mask to <16 x i1>
@@ -3918,8 +7023,14 @@ define <16 x i32> @stack_fold_shufi32x4_
}
define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_ternlogd
- ;CHECK: vpternlogd $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_ternlogd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpternlogd $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
ret <16 x i32> %2
@@ -3927,8 +7038,14 @@ define <16 x i32> @stack_fold_ternlogd(<
declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_ternlogq
- ;CHECK: vpternlogq $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_ternlogq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpternlogq $33, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
ret <8 x i64> %2
@@ -3937,16 +7054,33 @@ define <8 x i64> @stack_fold_ternlogq(<8
declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhbw_zmm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
ret <64 x i8> %2
}
define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_mask_zmm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_mask_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm2 {%k1} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
%3 = bitcast i64 %mask to <64 x i1>
@@ -3957,8 +7091,16 @@ define <64 x i8> @stack_fold_punpckhbw_m
}
define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: # zmm0 {%k1} {z} = zmm0[8],mem[8],zmm0[9],mem[9],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[12],mem[12],zmm0[13],mem[13],zmm0[14],mem[14],zmm0[15],mem[15],zmm0[24],mem[24],zmm0[25],mem[25],zmm0[26],mem[26],zmm0[27],mem[27],zmm0[28],mem[28],zmm0[29],mem[29],zmm0[30],mem[30],zmm0[31],mem[31],zmm0[40],mem[40],zmm0[41],mem[41],zmm0[42],mem[42],zmm0[43],mem[43],zmm0[44],mem[44],zmm0[45],mem[45],zmm0[46],mem[46],zmm0[47],mem[47],zmm0[56],mem[56],zmm0[57],mem[57],zmm0[58],mem[58],zmm0[59],mem[59],zmm0[60],mem[60],zmm0[61],mem[61],zmm0[62],mem[62],zmm0[63],mem[63]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
%3 = bitcast i64 %mask to <64 x i1>
@@ -3967,24 +7109,45 @@ define <64 x i8> @stack_fold_punpckhbw_m
}
define <16 x i32> @stack_fold_pxord(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pxord
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a0, %a1
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pxord_commuted(<16 x i32> %a0, <16 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pxord_commuted:
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a1, %a0
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_pxord_mask(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pxord_mask
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -3995,8 +7158,17 @@ define <16 x i32> @stack_fold_pxord_mask
}
define <16 x i32> @stack_fold_pxord_mask_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pxord_mask_commuted:
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovaps %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -4007,8 +7179,15 @@ define <16 x i32> @stack_fold_pxord_mask
}
define <16 x i32> @stack_fold_pxord_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pxord_maskz
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -4017,8 +7196,15 @@ define <16 x i32> @stack_fold_pxord_mask
}
define <16 x i32> @stack_fold_pxord_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pxord_maskz_commuted:
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxord_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <16 x i32> %a1, %a0
%3 = bitcast i16 %mask to <16 x i1>
@@ -4027,24 +7213,45 @@ define <16 x i32> @stack_fold_pxord_mask
}
define <8 x i64> @stack_fold_pxorq(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pxorq
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a0, %a1
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pxorq_commuted(<8 x i64> %a0, <8 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pxorq_commuted:
- ;CHECK: vxorps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vxorps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a1, %a0
ret <8 x i64> %2
}
define <8 x i64> @stack_fold_pxorq_mask(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pxorq_mask
- ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -4055,8 +7262,17 @@ define <8 x i64> @stack_fold_pxorq_mask(
}
define <8 x i64> @stack_fold_pxorq_mask_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64>* %a2, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pxorq_mask_commuted:
- ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: vmovapd %zmm0, %zmm1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm0},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
@@ -4067,8 +7283,15 @@ define <8 x i64> @stack_fold_pxorq_mask_
}
define <8 x i64> @stack_fold_pxorq_maskz(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pxorq_maskz
- ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a0, %a1
%3 = bitcast i8 %mask to <8 x i1>
@@ -4077,8 +7300,15 @@ define <8 x i64> @stack_fold_pxorq_maskz
}
define <8 x i64> @stack_fold_pxorq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
- ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxorq_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vxorpd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = xor <8 x i64> %a1, %a0
%3 = bitcast i8 %mask to <8 x i1>
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll?rev=369876&r1=369875&r2=369876&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll Sun Aug 25 13:48:14 2019
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,16 +10,37 @@ target triple = "x86_64-unknown-unknown"
; relevant registers and check that the reload is correctly folded into the instruction.
define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) {
- ;CHECK-LABEL: stack_fold_valignd_ymm
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7],ymm0[0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) {
- ;CHECK-LABEL: stack_fold_valignd_ymm_mask
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7],ymm0[0]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%3 = bitcast i8 %mask to <8 x i1>
@@ -28,8 +50,18 @@ define <8 x i32> @stack_fold_valignd_ymm
}
define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: stack_fold_valignd_ymm_maskz
- ;CHECK: valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignd_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: valignd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7],ymm0[0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%3 = bitcast i8 %mask to <8 x i1>
@@ -38,16 +70,31 @@ define <8 x i32> @stack_fold_valignd_ymm
}
define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) {
- ;CHECK-LABEL: stack_fold_valignq_ymm
- ;CHECK: valignq $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_valignq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: valignq $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[1,2,3],ymm0[0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i64> %2
}
define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pavgb
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <16 x i8> %a0 to <16 x i16>
%3 = zext <16 x i8> %a1 to <16 x i16>
@@ -59,8 +106,14 @@ define <16 x i8> @stack_fold_pavgb(<16 x
}
define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pavgb_ymm
- ;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <32 x i8> %a0 to <32 x i16>
%3 = zext <32 x i8> %a1 to <32 x i16>
@@ -72,8 +125,14 @@ define <32 x i8> @stack_fold_pavgb_ymm(<
}
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pavgw
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i32>
%3 = zext <8 x i16> %a1 to <8 x i32>
@@ -85,8 +144,14 @@ define <8 x i16> @stack_fold_pavgw(<8 x
}
define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pavgw_ymm
- ;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <16 x i16> %a0 to <16 x i32>
%3 = zext <16 x i16> %a1 to <16 x i32>
@@ -98,40 +163,72 @@ define <16 x i16> @stack_fold_pavgw_ymm(
}
define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictd
- ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.conflict.d.128(<4 x i32> %a0)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpconflictd_ymm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictd_ymm
- ;CHECK: vpconflictd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.conflict.d.256(<8 x i32> %a0)
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_vpconflictq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictq
- ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx512.conflict.q.128(<2 x i64> %a0)
ret <2 x i64> %2
}
define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpconflictq_ymm
- ;CHECK: vpconflictq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpconflictq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpconflictq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.conflict.q.256(<4 x i64> %a0)
ret <4 x i64> %2
}
define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_extracti32x4
- ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti32x4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <8 x i16> %a0 to <8 x i32>
%2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -140,8 +237,16 @@ define <4 x i32> @stack_fold_extracti32x
}
define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_extracti64x2
- ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_extracti64x2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
; zext forces execution domain
%1 = zext <4 x i32> %a0 to <4 x i64>
%2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
@@ -150,8 +255,17 @@ define <2 x i64> @stack_fold_extracti64x
}
define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_inserti32x4
- ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_inserti32x4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; add forces execution domain
@@ -160,8 +274,17 @@ define <8 x i32> @stack_fold_inserti32x4
}
define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_inserti64x2
- ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_inserti64x2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; add forces execution domain
@@ -170,8 +293,14 @@ define <4 x i64> @stack_fold_inserti64x2
}
define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pabsb
- ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, zeroinitializer
%3 = sub <16 x i8> zeroinitializer, %a0
@@ -180,8 +309,14 @@ define <16 x i8> @stack_fold_pabsb(<16 x
}
define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pabsb_ymm
- ;CHECK: vpabsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i8> %a0, zeroinitializer
%3 = sub <32 x i8> zeroinitializer, %a0
@@ -190,8 +325,14 @@ define <32 x i8> @stack_fold_pabsb_ymm(<
}
define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pabsd
- ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <4 x i32> %a0, zeroinitializer
%3 = sub <4 x i32> zeroinitializer, %a0
@@ -200,8 +341,14 @@ define <4 x i32> @stack_fold_pabsd(<4 x
}
define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pabsd_ymm
- ;CHECK: vpabsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i32> %a0, zeroinitializer
%3 = sub <8 x i32> zeroinitializer, %a0
@@ -210,8 +357,14 @@ define <8 x i32> @stack_fold_pabsd_ymm(<
}
define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_pabsq
- ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <2 x i64> %a0, zeroinitializer
%3 = sub <2 x i64> zeroinitializer, %a0
@@ -220,8 +373,14 @@ define <2 x i64> @stack_fold_pabsq(<2 x
}
define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_pabsq_ymm
- ;CHECK: vpabsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <4 x i64> %a0, zeroinitializer
%3 = sub <4 x i64> zeroinitializer, %a0
@@ -230,8 +389,14 @@ define <4 x i64> @stack_fold_pabsq_ymm(<
}
define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pabsw
- ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, zeroinitializer
%3 = sub <8 x i16> zeroinitializer, %a0
@@ -240,8 +405,14 @@ define <8 x i16> @stack_fold_pabsw(<8 x
}
define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pabsw_ymm
- ;CHECK: vpabsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i16> %a0, zeroinitializer
%3 = sub <16 x i16> zeroinitializer, %a0
@@ -250,8 +421,14 @@ define <16 x i16> @stack_fold_pabsw_ymm(
}
define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packssdw
- ;CHECK: vpackssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packssdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
ret <8 x i16> %2
@@ -259,8 +436,14 @@ define <8 x i16> @stack_fold_packssdw(<4
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packssdw_ymm
- ;CHECK: vpackssdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_packssdw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
ret <16 x i16> %2
@@ -268,8 +451,14 @@ define <16 x i16> @stack_fold_packssdw_y
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packsswb
- ;CHECK: vpacksswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packsswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
ret <16 x i8> %2
@@ -277,8 +466,14 @@ define <16 x i8> @stack_fold_packsswb(<8
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packsswb_ymm
- ;CHECK: vpacksswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_packsswb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
ret <32 x i8> %2
@@ -286,8 +481,14 @@ define <32 x i8> @stack_fold_packsswb_ym
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packusdw
- ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
ret <8 x i16> %2
@@ -295,8 +496,14 @@ define <8 x i16> @stack_fold_packusdw(<4
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packusdw_ymm
- ;CHECK: vpackusdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
ret <16 x i16> %2
@@ -304,8 +511,14 @@ define <16 x i16> @stack_fold_packusdw_y
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packuswb
- ;CHECK: vpackuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packuswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
ret <16 x i8> %2
@@ -313,8 +526,14 @@ define <16 x i8> @stack_fold_packuswb(<8
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packuswb_ymm
- ;CHECK: vpackuswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_packuswb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
ret <32 x i8> %2
@@ -322,16 +541,31 @@ define <32 x i8> @stack_fold_packuswb_ym
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddb
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i8> %a0, %a1
ret <16 x i8> %2
}
define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_mask
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i8> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -342,8 +576,15 @@ define <16 x i8> @stack_fold_paddb_mask(
}
define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_maskz
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i8> %a0, %a1
%3 = bitcast i16 %mask to <16 x i1>
@@ -352,16 +593,31 @@ define <16 x i8> @stack_fold_paddb_maskz
}
define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddb_ymm
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i8> %a0, %a1
ret <32 x i8> %2
}
define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_mask_ymm
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_mask_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i8> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -372,8 +628,15 @@ define <32 x i8> @stack_fold_paddb_mask_
}
define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_paddb_maskz_ymm
- ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb_maskz_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <32 x i8> %a0, %a1
%3 = bitcast i32 %mask to <32 x i1>
@@ -382,40 +645,70 @@ define <32 x i8> @stack_fold_paddb_maskz
}
define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_paddd
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <4 x i32> %a0, %a1
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_paddd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_paddd_ymm
- ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i32> %a0, %a1
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_paddq
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <2 x i64> %a0, %a1
ret <2 x i64> %2
}
define <4 x i64> @stack_fold_paddq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_paddq_ymm
- ;CHECK: vpaddq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <4 x i64> %a0, %a1
ret <4 x i64> %2
}
define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddsb
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -423,8 +716,14 @@ define <16 x i8> @stack_fold_paddsb(<16
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_paddsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddsb_ymm
- ;CHECK: vpaddsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
@@ -432,8 +731,14 @@ define <32 x i8> @stack_fold_paddsb_ymm(
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddsw
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -441,8 +746,14 @@ define <8 x i16> @stack_fold_paddsw(<8 x
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_paddsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddsw_ymm
- ;CHECK: vpaddsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -450,8 +761,14 @@ define <16 x i16> @stack_fold_paddsw_ymm
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddusb
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -459,8 +776,14 @@ define <16 x i8> @stack_fold_paddusb(<16
declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_paddusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddusb_ymm
- ;CHECK: vpaddusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
@@ -468,8 +791,14 @@ define <32 x i8> @stack_fold_paddusb_ymm
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddusw
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -477,8 +806,14 @@ define <8 x i16> @stack_fold_paddusw(<8
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_paddusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddusw_ymm
- ;CHECK: vpaddusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -486,32 +821,65 @@ define <16 x i16> @stack_fold_paddusw_ym
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddw
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <8 x i16> %a0, %a1
ret <8 x i16> %2
}
define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddw_ymm
- ;CHECK: vpaddw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = add <16 x i16> %a0, %a1
ret <16 x i16> %2
}
define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_palignr
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
ret <32 x i8> %2
}
define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) {
- ;CHECK-LABEL: stack_fold_palignr_mask
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm1 {%k1} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
%3 = bitcast i32 %mask to <32 x i1>
@@ -521,8 +889,18 @@ define <32 x i8> @stack_fold_palignr_mas
}
define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_palignr_maskz
- ;CHECK: vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
%3 = bitcast i32 %mask to <32 x i1>
@@ -531,8 +909,16 @@ define <32 x i8> @stack_fold_palignr_mas
}
define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqb
- ;CHECK: vpcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <16 x i8> %a0, %a1
%3 = bitcast <16 x i1> %2 to i16
@@ -540,8 +926,16 @@ define i16 @stack_fold_pcmpeqb(<16 x i8>
}
define i8 @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqd
- ;CHECK: vpcmpeqd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <4 x i32> %a0, %a1
%3 = shufflevector <4 x i1> %2, <4 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -550,8 +944,16 @@ define i8 @stack_fold_pcmpeqd(<4 x i32>
}
define i8 @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqq
- ;CHECK: vpcmpeqq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <2 x i64> %a0, %a1
%3 = shufflevector <2 x i1> %2, <2 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
@@ -560,8 +962,16 @@ define i8 @stack_fold_pcmpeqq(<2 x i64>
}
define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqw
- ;CHECK: vpcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 # 16-byte Folded Reload
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp eq <8 x i16> %a0, %a1
%3 = bitcast <8 x i1> %2 to i8
@@ -569,8 +979,18 @@ define i8 @stack_fold_pcmpeqw(<8 x i16>
}
define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_permbvar
- ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_permbvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpermb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0)
; add forces execution domain
@@ -580,8 +1000,16 @@ define <32 x i8> @stack_fold_permbvar(<3
declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) nounwind readonly
define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_permd
- ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_permd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
; add forces execution domain
@@ -591,72 +1019,129 @@ define <8 x i32> @stack_fold_permd(<8 x
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2b
- ;CHECK: vpermi2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2)
ret <16 x i8> %2
}
define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2b_ymm
- ;CHECK: vpermi2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2b_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2)
ret <32 x i8> %2
}
define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2d
- ;CHECK: vpermi2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2d_ymm
- ;CHECK: vpermi2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2d_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2q
- ;CHECK: vpermi2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2q:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
ret <2 x i64> %2
}
define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2q_ymm
- ;CHECK: vpermi2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2q_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
ret <4 x i64> %2
}
define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2w
- ;CHECK: vpermi2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
ret <8 x i16> %2
}
define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermi2w_ymm
- ;CHECK: vpermi2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermi2w_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermi2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2)
ret <16 x i16> %2
}
define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_permq
- ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_permq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[3,2,2,3]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
; add forces execution domain
@@ -665,8 +1150,18 @@ define <4 x i64> @stack_fold_permq(<4 x
}
define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_permqvar
- ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_permqvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpermq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0)
; add forces execution domain
@@ -676,8 +1171,14 @@ define <4 x i64> @stack_fold_permqvar(<4
declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) nounwind readonly
define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2b
- ;CHECK: vpermt2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
ret <16 x i8> %2
@@ -685,8 +1186,14 @@ define <16 x i8> @stack_fold_vpermt2b(<1
declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>)
define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2b_ymm
- ;CHECK: vpermt2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2b_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2b {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
ret <32 x i8> %2
@@ -694,8 +1201,14 @@ define <32 x i8> @stack_fold_vpermt2b_ym
declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>)
define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2d
- ;CHECK: vpermt2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %2
@@ -703,8 +1216,14 @@ define <4 x i32> @stack_fold_vpermt2d(<4
declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2d_ymm
- ;CHECK: vpermt2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2d_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %2
@@ -712,8 +1231,14 @@ define <8 x i32> @stack_fold_vpermt2d_ym
declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2q
- ;CHECK: vpermt2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2q:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
ret <2 x i64> %2
@@ -721,8 +1246,14 @@ define <2 x i64> @stack_fold_vpermt2q(<2
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2q_ymm
- ;CHECK: vpermt2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2q_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
ret <4 x i64> %2
@@ -730,8 +1261,14 @@ define <4 x i64> @stack_fold_vpermt2q_ym
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2w
- ;CHECK: vpermt2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <8 x i16> %2
@@ -739,8 +1276,14 @@ define <8 x i16> @stack_fold_vpermt2w(<8
declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>)
define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
- ;CHECK-LABEL: stack_fold_vpermt2w_ymm
- ;CHECK: vpermt2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpermt2w_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpermt2w {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <16 x i16> %2
@@ -748,8 +1291,18 @@ define <16 x i16> @stack_fold_vpermt2w_y
declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>)
define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_permwvar
- ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_permwvar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpermw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0)
; add forces execution domain
@@ -759,40 +1312,70 @@ define <16 x i16> @stack_fold_permwvar(<
declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) nounwind readonly
define <4 x i32> @stack_fold_vplzcntd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntd
- ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a0, i1 false)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vplzcntd_ymm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntd_ymm
- ;CHECK: vplzcntd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a0, i1 false)
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_vplzcntq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntq
- ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a0, i1 false)
ret <2 x i64> %2
}
define <4 x i64> @stack_fold_vplzcntq_ymm(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vplzcntq_ymm
- ;CHECK: vplzcntq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vplzcntq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vplzcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a0, i1 false)
ret <4 x i64> %2
}
define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddubsw
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
ret <8 x i16> %2
@@ -800,8 +1383,17 @@ define <8 x i16> @stack_fold_pmaddubsw(<
declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_mask
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -812,8 +1404,15 @@ define <8 x i16> @stack_fold_pmaddubsw_m
}
define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_maskz
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -822,8 +1421,14 @@ define <8 x i16> @stack_fold_pmaddubsw_m
}
define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_ymm
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
ret <16 x i16> %2
@@ -831,8 +1436,17 @@ define <16 x i16> @stack_fold_pmaddubsw_
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -843,8 +1457,15 @@ define <16 x i16> @stack_fold_pmaddubsw_
}
define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz
- ;CHECK: vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -853,8 +1474,14 @@ define <16 x i16> @stack_fold_pmaddubsw_
}
define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddwd
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
ret <4 x i32> %2
@@ -862,8 +1489,17 @@ define <4 x i32> @stack_fold_pmaddwd(<8
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_mask
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -875,8 +1511,15 @@ define <4 x i32> @stack_fold_pmaddwd_mas
}
define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_maskz
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -886,8 +1529,14 @@ define <4 x i32> @stack_fold_pmaddwd_mas
}
define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddwd_ymm
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
ret <8 x i32> %2
@@ -895,8 +1544,17 @@ define <8 x i32> @stack_fold_pmaddwd_ymm
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_ymm_mask
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -907,8 +1565,15 @@ define <8 x i32> @stack_fold_pmaddwd_ymm
}
define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz
- ;CHECK: vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
%3 = bitcast i8 %mask to <8 x i1>
@@ -917,8 +1582,14 @@ define <8 x i32> @stack_fold_pmaddwd_ymm
}
define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsb
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -926,8 +1597,14 @@ define <16 x i8> @stack_fold_pmaxsb(<16
}
define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsb_ymm
- ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
@@ -935,8 +1612,14 @@ define <32 x i8> @stack_fold_pmaxsb_ymm(
}
define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsd
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -944,8 +1627,14 @@ define <4 x i32> @stack_fold_pmaxsd(<4 x
}
define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsd_ymm
- ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
@@ -953,8 +1642,14 @@ define <8 x i32> @stack_fold_pmaxsd_ymm(
}
define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsq
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <2 x i64> %a0, %a1
%3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
@@ -962,8 +1657,14 @@ define <2 x i64> @stack_fold_pmaxsq(<2 x
}
define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsq_ymm
- ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <4 x i64> %a0, %a1
%3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
@@ -971,8 +1672,14 @@ define <4 x i64> @stack_fold_pmaxsq_ymm(
}
define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsw
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -980,8 +1687,14 @@ define <8 x i16> @stack_fold_pmaxsw(<8 x
}
define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsw_ymm
- ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp sgt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
@@ -989,8 +1702,14 @@ define <16 x i16> @stack_fold_pmaxsw_ymm
}
define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxub
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -998,8 +1717,14 @@ define <16 x i8> @stack_fold_pmaxub(<16
}
define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxub_ymm
- ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
@@ -1007,8 +1732,14 @@ define <32 x i8> @stack_fold_pmaxub_ymm(
}
define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxud
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -1016,8 +1747,14 @@ define <4 x i32> @stack_fold_pmaxud(<4 x
}
define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxud_ymm
- ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
@@ -1025,8 +1762,14 @@ define <8 x i32> @stack_fold_pmaxud_ymm(
}
define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuq
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <2 x i64> %a0, %a1
%3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
@@ -1034,8 +1777,17 @@ define <2 x i64> @stack_fold_pmaxuq(<2 x
}
define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_mask
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <2 x i64>, <2 x i64>* %passthru
%3 = icmp ugt <2 x i64> %a0, %a1
@@ -1047,8 +1799,15 @@ define <2 x i64> @stack_fold_pmaxuq_mask
}
define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_maskz
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <2 x i64> %a0, %a1
%3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
@@ -1059,8 +1818,14 @@ define <2 x i64> @stack_fold_pmaxuq_mask
}
define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuq_ymm
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <4 x i64> %a0, %a1
%3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
@@ -1068,8 +1833,17 @@ define <4 x i64> @stack_fold_pmaxuq_ymm(
}
define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_ymm_mask
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <4 x i64>, <4 x i64>* %passthru
%3 = icmp ugt <4 x i64> %a0, %a1
@@ -1081,8 +1855,15 @@ define <4 x i64> @stack_fold_pmaxuq_ymm_
}
define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz
- ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmaxuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <4 x i64> %a0, %a1
%3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
@@ -1093,8 +1874,14 @@ define <4 x i64> @stack_fold_pmaxuq_ymm_
}
define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuw
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -1102,8 +1889,14 @@ define <8 x i16> @stack_fold_pmaxuw(<8 x
}
define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuw_ymm
- ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ugt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
@@ -1112,8 +1905,14 @@ define <16 x i16> @stack_fold_pmaxuw_ymm
declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminsb
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -1121,8 +1920,14 @@ define <16 x i8> @stack_fold_pminsb(<16
}
define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminsb_ymm
- ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
@@ -1130,8 +1935,14 @@ define <32 x i8> @stack_fold_pminsb_ymm(
}
define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminsd
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -1139,8 +1950,14 @@ define <4 x i32> @stack_fold_pminsd(<4 x
}
define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminsd_ymm
- ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
@@ -1148,8 +1965,14 @@ define <8 x i32> @stack_fold_pminsd_ymm(
}
define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminsq
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <2 x i64> %a0, %a1
%3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
@@ -1157,8 +1980,14 @@ define <2 x i64> @stack_fold_pminsq(<2 x
}
define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminsq_ymm
- ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <4 x i64> %a0, %a1
%3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
@@ -1166,8 +1995,14 @@ define <4 x i64> @stack_fold_pminsq_ymm(
}
define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminsw
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -1175,8 +2010,14 @@ define <8 x i16> @stack_fold_pminsw(<8 x
}
define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminsw_ymm
- ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp slt <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
@@ -1184,8 +2025,14 @@ define <16 x i16> @stack_fold_pminsw_ymm
}
define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminub
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -1193,8 +2040,14 @@ define <16 x i8> @stack_fold_pminub(<16
}
define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminub_ymm
- ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <32 x i8> %a0, %a1
%3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
@@ -1202,8 +2055,14 @@ define <32 x i8> @stack_fold_pminub_ymm(
}
define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminud
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -1211,8 +2070,14 @@ define <4 x i32> @stack_fold_pminud(<4 x
}
define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminud_ymm
- ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i32> %a0, %a1
%3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
@@ -1220,8 +2085,14 @@ define <8 x i32> @stack_fold_pminud_ymm(
}
define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminuq
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <2 x i64> %a0, %a1
%3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %a1
@@ -1229,8 +2100,14 @@ define <2 x i64> @stack_fold_pminuq(<2 x
}
define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pminuq_ymm
- ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <4 x i64> %a0, %a1
%3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %a1
@@ -1238,8 +2115,14 @@ define <4 x i64> @stack_fold_pminuq_ymm(
}
define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminuw
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -1247,8 +2130,14 @@ define <8 x i16> @stack_fold_pminuw(<8 x
}
define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminuw_ymm
- ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = icmp ult <16 x i16> %a0, %a1
%3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
@@ -1256,8 +2145,15 @@ define <16 x i16> @stack_fold_pminuw_ymm
}
define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovdw
- ;CHECK: vpmovdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -1265,8 +2161,15 @@ define <8 x i16> @stack_fold_vpmovdw(<8
declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovqd
- ;CHECK: vpmovqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = trunc <4 x i64> %a0 to <4 x i32>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <4 x i32> %1
@@ -1274,8 +2177,15 @@ define <4 x i32> @stack_fold_vpmovqd(<4
declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovwb
- ;CHECK: vpmovwb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovwb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovwb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = trunc <16 x i16> %a0 to <16 x i8>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -1283,8 +2193,15 @@ define <16 x i8> @stack_fold_vpmovwb(<16
declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsdw
- ;CHECK: vpmovsdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -1292,8 +2209,15 @@ define <8 x i16> @stack_fold_vpmovsdw(<8
declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovsqd
- ;CHECK: vpmovsqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovsqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <4 x i32> %1
@@ -1301,8 +2225,15 @@ define <4 x i32> @stack_fold_vpmovsqd(<4
declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovswb
- ;CHECK: vpmovswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -1310,8 +2241,14 @@ define <16 x i8> @stack_fold_vpmovswb(<1
declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbd
- ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i8> %2 to <4 x i32>
@@ -1319,8 +2256,14 @@ define <4 x i32> @stack_fold_pmovsxbd(<1
}
define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbd_ymm
- ;CHECK: vpmovsxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = sext <8 x i8> %2 to <8 x i32>
@@ -1328,8 +2271,14 @@ define <8 x i32> @stack_fold_pmovsxbd_ym
}
define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbq
- ;CHECK: vpmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i8> %2 to <2 x i64>
@@ -1337,8 +2286,14 @@ define <2 x i64> @stack_fold_pmovsxbq(<1
}
define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbq_ymm
- ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i8> %2 to <4 x i64>
@@ -1346,8 +2301,14 @@ define <4 x i64> @stack_fold_pmovsxbq_ym
}
define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbw
- ;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = sext <8 x i8> %2 to <8 x i16>
@@ -1355,16 +2316,28 @@ define <8 x i16> @stack_fold_pmovsxbw(<1
}
define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbw_ymm
- ;CHECK: vpmovsxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}
define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxdq
- ;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i32> %2 to <2 x i64>
@@ -1372,16 +2345,28 @@ define <2 x i64> @stack_fold_pmovsxdq(<4
}
define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxdq_ymm
- ;CHECK: vpmovsxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxdq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}
define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwd
- ;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i16> %2 to <4 x i32>
@@ -1389,16 +2374,28 @@ define <4 x i32> @stack_fold_pmovsxwd(<8
}
define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwd_ymm
- ;CHECK: vpmovsxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwq
- ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i16> %2 to <2 x i64>
@@ -1406,8 +2403,14 @@ define <2 x i64> @stack_fold_pmovsxwq(<8
}
define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwq_ymm
- ;CHECK: vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i16> %2 to <4 x i64>
@@ -1415,8 +2418,15 @@ define <4 x i64> @stack_fold_pmovsxwq_ym
}
define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusdw
- ;CHECK: vpmovusdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusdw %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x i16> %1
@@ -1424,8 +2434,15 @@ define <8 x i16> @stack_fold_vpmovusdw(<
declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovusqd
- ;CHECK: vpmovusqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovusqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovusqd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <4 x i32> %1
@@ -1433,8 +2450,15 @@ define <4 x i32> @stack_fold_vpmovusqd(<
declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_vpmovuswb
- ;CHECK: vpmovuswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+; CHECK-LABEL: stack_fold_vpmovuswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovuswb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <16 x i8> %1
@@ -1442,8 +2466,15 @@ define <16 x i8> @stack_fold_vpmovuswb(<
declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbd
- ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
%3 = bitcast <16 x i8> %2 to <4 x i32>
@@ -1451,8 +2482,15 @@ define <4 x i32> @stack_fold_pmovzxbd(<1
}
define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbd_ymm
- ;CHECK: vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = zext <8 x i8> %2 to <8 x i32>
@@ -1460,8 +2498,15 @@ define <8 x i32> @stack_fold_pmovzxbd_ym
}
define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbq
- ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
%3 = bitcast <16 x i8> %2 to <2 x i64>
@@ -1469,8 +2514,15 @@ define <2 x i64> @stack_fold_pmovzxbq(<1
}
define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbq_ymm
- ;CHECK: vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i8> %2 to <4 x i64>
@@ -1478,8 +2530,15 @@ define <4 x i64> @stack_fold_pmovzxbq_ym
}
define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbw
- ;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%3 = bitcast <16 x i8> %2 to <8 x i16>
@@ -1487,16 +2546,30 @@ define <8 x i16> @stack_fold_pmovzxbw(<1
}
define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbw_ymm
- ;CHECK: vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <16 x i8> %a0 to <16 x i16>
ret <16 x i16> %2
}
define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxdq
- ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%3 = bitcast <4 x i32> %2 to <2 x i64>
@@ -1504,16 +2577,30 @@ define <2 x i64> @stack_fold_pmovzxdq(<4
}
define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxdq_ymm
- ;CHECK: vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxdq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <4 x i32> %a0 to <4 x i64>
ret <4 x i64> %2
}
define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwd
- ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%3 = bitcast <8 x i16> %2 to <4 x i32>
@@ -1521,16 +2608,30 @@ define <4 x i32> @stack_fold_pmovzxwd(<8
}
define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwd_ymm
- ;CHECK: vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i32>
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwq
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
%3 = bitcast <8 x i16> %2 to <2 x i64>
@@ -1538,8 +2639,15 @@ define <2 x i64> @stack_fold_pmovzxwq(<8
}
define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_ymm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i16> %2 to <4 x i64>
@@ -1547,8 +2655,16 @@ define <4 x i64> @stack_fold_pmovzxwq_ym
}
define <4 x i64> @stack_fold_pmovzxwq_maskz_ymm(<8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_maskz_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i16> %2 to <4 x i64>
@@ -1559,8 +2675,16 @@ define <4 x i64> @stack_fold_pmovzxwq_ma
}
define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm
- ;CHECK: vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq_mask_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = zext <4 x i16> %2 to <4 x i64>
@@ -1571,8 +2695,14 @@ define <4 x i64> @stack_fold_pmovzxwq_ma
}
define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuldq
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <4 x i32> %a0 to <2 x i64>
%3 = bitcast <4 x i32> %a1 to <2 x i64>
@@ -1585,8 +2715,14 @@ define <2 x i64> @stack_fold_pmuldq(<4 x
}
define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuldq_ymm
- ;CHECK: vpmuldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
@@ -1599,8 +2735,14 @@ define <4 x i64> @stack_fold_pmuldq_ymm(
}
define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuludq
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <4 x i32> %a0 to <2 x i64>
%3 = bitcast <4 x i32> %a1 to <2 x i64>
@@ -1611,8 +2753,14 @@ define <2 x i64> @stack_fold_pmuludq(<4
}
define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuludq_ymm
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
@@ -1623,8 +2771,19 @@ define <4 x i64> @stack_fold_pmuludq_ymm
}
define <4 x i64> @stack_fold_pmuludq_ymm_mask(<4 x i64>* %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_ymm_mask
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
@@ -1639,8 +2798,17 @@ define <4 x i64> @stack_fold_pmuludq_ymm
}
define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pmuludq_ymm_maskz
- ;CHECK: vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = bitcast <8 x i32> %a0 to <4 x i64>
%3 = bitcast <8 x i32> %a1 to <4 x i64>
@@ -1654,8 +2822,14 @@ define <4 x i64> @stack_fold_pmuludq_ymm
}
define <4 x i32> @stack_fold_vpopcntd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntd
- ;CHECK: vpopcntd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a0)
ret <4 x i32> %2
@@ -1663,8 +2837,14 @@ define <4 x i32> @stack_fold_vpopcntd(<4
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readonly
define <8 x i32> @stack_fold_vpopcntd_ymm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntd_ymm
- ;CHECK: vpopcntd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a0)
ret <8 x i32> %2
@@ -1672,8 +2852,14 @@ define <8 x i32> @stack_fold_vpopcntd_ym
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readonly
define <2 x i64> @stack_fold_vpopcntq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntq
- ;CHECK: vpopcntq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a0)
ret <2 x i64> %2
@@ -1681,8 +2867,14 @@ define <2 x i64> @stack_fold_vpopcntq(<2
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_vpopcntq_ymm(<4 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_vpopcntq_ymm
- ;CHECK: vpopcntq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpopcntq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpopcntq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a0)
ret <4 x i64> %2
@@ -1690,8 +2882,14 @@ define <4 x i64> @stack_fold_vpopcntq_ym
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psadbw
- ;CHECK: vpsadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psadbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
ret <2 x i64> %2
@@ -1699,8 +2897,14 @@ define <2 x i64> @stack_fold_psadbw(<16
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psadbw_ymm
- ;CHECK: vpsadbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psadbw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
ret <4 x i64> %2
@@ -1708,8 +2912,14 @@ define <4 x i64> @stack_fold_psadbw_ymm(
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pshufb
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -1717,8 +2927,17 @@ define <16 x i8> @stack_fold_pshufb(<16
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_mask
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i8>, <16 x i8>* %passthru
%3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
@@ -1728,8 +2947,15 @@ define <16 x i8> @stack_fold_pshufb_mask
}
define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_maskz
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
%3 = bitcast i16 %mask to <16 x i1>
@@ -1738,8 +2964,14 @@ define <16 x i8> @stack_fold_pshufb_mask
}
define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pshufb_ymm
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
@@ -1747,8 +2979,17 @@ define <32 x i8> @stack_fold_pshufb_ymm(
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
define <32 x i8> @stack_fold_pshufb_ymm_mask(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_ymm_mask
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x i8>, <32 x i8>* %passthru
%3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
@@ -1758,8 +2999,15 @@ define <32 x i8> @stack_fold_pshufb_ymm_
}
define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_pshufb_ymm_maskz
- ;CHECK: vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
%3 = bitcast i32 %mask to <32 x i1>
@@ -1768,8 +3016,17 @@ define <32 x i8> @stack_fold_pshufb_ymm_
}
define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pshufd
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,2,1,0]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
@@ -1777,8 +3034,18 @@ define <4 x i32> @stack_fold_pshufd(<4 x
}
define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_mask
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1788,8 +3055,16 @@ define <4 x i32> @stack_fold_pshufd_mask
}
define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_maskz
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1799,8 +3074,17 @@ define <4 x i32> @stack_fold_pshufd_mask
}
define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pshufd_ymm
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
@@ -1808,8 +3092,18 @@ define <8 x i32> @stack_fold_pshufd_ymm(
}
define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_ymm_mask
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1818,8 +3112,16 @@ define <8 x i32> @stack_fold_pshufd_ymm_
}
define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufd_ymm_maskz
- ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1828,16 +3130,33 @@ define <8 x i32> @stack_fold_pshufd_ymm_
}
define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshufhw
- ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
ret <8 x i16> %2
}
define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_mask
- ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} = mem[0,1,2,3,7,6,4,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1846,8 +3165,16 @@ define <8 x i16> @stack_fold_pshufhw_mas
}
define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_maskz
- ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} {z} = mem[0,1,2,3,7,6,4,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1856,16 +3183,33 @@ define <8 x i16> @stack_fold_pshufhw_mas
}
define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshufhw_ymm
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
ret <16 x i16> %2
}
define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_ymm_mask
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
%3 = bitcast i16 %mask to <16 x i1>
@@ -1874,8 +3218,16 @@ define <16 x i16> @stack_fold_pshufhw_ym
}
define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshufhw_ymm_maskz
- ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
%3 = bitcast i16 %mask to <16 x i1>
@@ -1884,16 +3236,33 @@ define <16 x i16> @stack_fold_pshufhw_ym
}
define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshuflw
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %2
}
define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_mask
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} = mem[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1902,8 +3271,16 @@ define <8 x i16> @stack_fold_pshuflw_mas
}
define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_maskz
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
%3 = bitcast i8 %mask to <8 x i1>
@@ -1912,16 +3289,33 @@ define <8 x i16> @stack_fold_pshuflw_mas
}
define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshuflw_ymm
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %2
}
define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_ymm_mask
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_ymm_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
%3 = bitcast i16 %mask to <16 x i1>
@@ -1930,8 +3324,16 @@ define <16 x i16> @stack_fold_pshuflw_ym
}
define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
- ;CHECK-LABEL: stack_fold_pshuflw_ymm_maskz
- ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw_ymm_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
%3 = bitcast i16 %mask to <16 x i1>
@@ -1940,8 +3342,14 @@ define <16 x i16> @stack_fold_pshuflw_ym
}
define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pslld
- ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -1949,8 +3357,14 @@ define <4 x i32> @stack_fold_pslld(<4 x
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pslld_ymm
- ;CHECK: vpslld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
@@ -1958,24 +3372,44 @@ define <8 x i32> @stack_fold_pslld_ymm(<
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) {
- ;CHECK-LABEL: stack_fold_pslldq
- ;CHECK: vpslldq $12, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0,1,2,3]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19>
ret <16 x i8> %2
}
define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) {
- ;CHECK-LABEL: stack_fold_pslldq_ymm
- ;CHECK: vpslldq $15, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslldq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpslldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[16]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
ret <32 x i8> %2
}
define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllq
- ;CHECK: vpsllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -1983,8 +3417,14 @@ define <2 x i64> @stack_fold_psllq(<2 x
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllq_ymm
- ;CHECK: vpsllq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %2
@@ -1992,8 +3432,14 @@ define <4 x i64> @stack_fold_psllq_ymm(<
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psllvd
- ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -2001,8 +3447,14 @@ define <4 x i32> @stack_fold_psllvd(<4 x
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psllvd_ymm
- ;CHECK: vpsllvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
@@ -2010,8 +3462,14 @@ define <8 x i32> @stack_fold_psllvd_ymm(
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllvq
- ;CHECK: vpsllvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -2019,8 +3477,14 @@ define <2 x i64> @stack_fold_psllvq(<2 x
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllvq_ymm
- ;CHECK: vpsllvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %2
@@ -2028,8 +3492,14 @@ define <4 x i64> @stack_fold_psllvq_ymm(
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllvw
- ;CHECK: vpsllvw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2037,8 +3507,14 @@ define <8 x i16> @stack_fold_psllvw(<8 x
declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllvw_ymm
- ;CHECK: vpsllvw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllvw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -2046,8 +3522,14 @@ define <16 x i16> @stack_fold_psllvw_ymm
declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllw
- ;CHECK: vpsllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2055,8 +3537,14 @@ define <8 x i16> @stack_fold_psllw(<8 x
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllw_ymm
- ;CHECK: vpsllw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
@@ -2064,8 +3552,14 @@ define <16 x i16> @stack_fold_psllw_ymm(
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrad
- ;CHECK: vpsrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrad:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -2073,8 +3567,14 @@ define <4 x i32> @stack_fold_psrad(<4 x
declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrad_ymm
- ;CHECK: vpsrad {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrad_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
@@ -2082,8 +3582,14 @@ define <8 x i32> @stack_fold_psrad_ymm(<
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psraq
- ;CHECK: vpsraq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -2091,8 +3597,14 @@ define <2 x i64> @stack_fold_psraq(<2 x
declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psraq_ymm
- ;CHECK: vpsraq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %2
@@ -2100,8 +3612,14 @@ define <4 x i64> @stack_fold_psraq_ymm(<
declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psravd
- ;CHECK: vpsravd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -2109,8 +3627,14 @@ define <4 x i32> @stack_fold_psravd(<4 x
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psravd_ymm
- ;CHECK: vpsravd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
@@ -2118,8 +3642,14 @@ define <8 x i32> @stack_fold_psravd_ymm(
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psravq
- ;CHECK: vpsravq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -2127,8 +3657,14 @@ define <2 x i64> @stack_fold_psravq(<2 x
declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psravq_ymm
- ;CHECK: vpsravq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %2
@@ -2136,8 +3672,14 @@ define <4 x i64> @stack_fold_psravq_ymm(
declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psravw
- ;CHECK: vpsravw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2145,8 +3687,14 @@ define <8 x i16> @stack_fold_psravw(<8 x
declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psravw_ymm
- ;CHECK: vpsravw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psravw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsravw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -2154,8 +3702,14 @@ define <16 x i16> @stack_fold_psravw_ymm
declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psraw
- ;CHECK: vpsraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2163,8 +3717,14 @@ define <8 x i16> @stack_fold_psraw(<8 x
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psraw_ymm
- ;CHECK: vpsraw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
@@ -2172,8 +3732,14 @@ define <16 x i16> @stack_fold_psraw_ymm(
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrld
- ;CHECK: vpsrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -2181,8 +3747,16 @@ define <4 x i32> @stack_fold_psrld(<4 x
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrld_ymm
- ;CHECK: vpsrld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrld_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
ret <8 x i32> %2
@@ -2190,24 +3764,44 @@ define <8 x i32> @stack_fold_psrld_ymm(<
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) {
- ;CHECK-LABEL: stack_fold_psrldq
- ;CHECK: vpsrldq $12, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrldq $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i8> %2
}
define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) {
- ;CHECK-LABEL: stack_fold_psrldq_ymm
- ;CHECK: vpsrldq $15, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrldq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrldq $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = mem[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,mem[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %2
}
define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlq
- ;CHECK: vpsrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -2215,8 +3809,14 @@ define <2 x i64> @stack_fold_psrlq(<2 x
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlq_ymm
- ;CHECK: vpsrlq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %2
@@ -2224,8 +3824,14 @@ define <4 x i64> @stack_fold_psrlq_ymm(<
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvd
- ;CHECK: vpsrlvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -2233,8 +3839,14 @@ define <4 x i32> @stack_fold_psrlvd(<4 x
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvd_ymm
- ;CHECK: vpsrlvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %2
@@ -2242,8 +3854,14 @@ define <8 x i32> @stack_fold_psrlvd_ymm(
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvq
- ;CHECK: vpsrlvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -2251,8 +3869,14 @@ define <2 x i64> @stack_fold_psrlvq(<2 x
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvq_ymm
- ;CHECK: vpsrlvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %2
@@ -2260,8 +3884,14 @@ define <4 x i64> @stack_fold_psrlvq_ymm(
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvw
- ;CHECK: vpsrlvw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2269,8 +3899,14 @@ define <8 x i16> @stack_fold_psrlvw(<8 x
declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlvw_ymm
- ;CHECK: vpsrlvw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlvw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlvw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -2278,8 +3914,14 @@ define <16 x i16> @stack_fold_psrlvw_ymm
declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlw
- ;CHECK: vpsrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2287,8 +3929,14 @@ define <8 x i16> @stack_fold_psrlw(<8 x
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlw_ymm
- ;CHECK: vpsrlw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
ret <16 x i16> %2
@@ -2296,56 +3944,98 @@ define <16 x i16> @stack_fold_psrlw_ymm(
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubb
- ;CHECK: vpsubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <16 x i8> %a0, %a1
ret <16 x i8> %2
}
define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubb_ymm
- ;CHECK: vpsubb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <32 x i8> %a0, %a1
ret <32 x i8> %2
}
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psubd
- ;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <4 x i32> %a0, %a1
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psubd_ymm
- ;CHECK: vpsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubd_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <8 x i32> %a0, %a1
ret <8 x i32> %2
}
define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psubq
- ;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <2 x i64> %a0, %a1
ret <2 x i64> %2
}
define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psubq_ymm
- ;CHECK: vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubq_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <4 x i64> %a0, %a1
ret <4 x i64> %2
}
define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubsb
- ;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -2353,8 +4043,14 @@ define <16 x i8> @stack_fold_psubsb(<16
declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubsb_ymm
- ;CHECK: vpsubsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
@@ -2362,8 +4058,14 @@ define <32 x i8> @stack_fold_psubsb_ymm(
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubsw
- ;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2371,8 +4073,14 @@ define <8 x i16> @stack_fold_psubsw(<8 x
declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubsw_ymm
- ;CHECK: vpsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -2380,8 +4088,14 @@ define <16 x i16> @stack_fold_psubsw_ymm
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubusb
- ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -2389,8 +4103,14 @@ define <16 x i8> @stack_fold_psubusb(<16
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubusb_ymm
- ;CHECK: vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusb_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
@@ -2398,8 +4118,14 @@ define <32 x i8> @stack_fold_psubusb_ymm
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubusw
- ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -2407,8 +4133,14 @@ define <8 x i16> @stack_fold_psubusw(<8
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubusw_ymm
- ;CHECK: vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
@@ -2416,32 +4148,61 @@ define <16 x i16> @stack_fold_psubusw_ym
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubw
- ;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <8 x i16> %a0, %a1
ret <8 x i16> %2
}
define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubw_ymm
- ;CHECK: vpsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = sub <16 x i16> %a0, %a1
ret <16 x i16> %2
}
define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhbw
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i8> %2
}
define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_mask
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 {%k1} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm2 {%k1} = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%3 = bitcast i16 %mask to <16 x i1>
@@ -2452,8 +4213,16 @@ define <16 x i8> @stack_fold_punpckhbw_m
}
define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_maskz
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 {%k1} {z} = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%3 = bitcast i16 %mask to <16 x i1>
@@ -2462,16 +4231,33 @@ define <16 x i8> @stack_fold_punpckhbw_m
}
define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhbw_ymm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
ret <32 x i8> %2
}
define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_mask_ymm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_mask_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 {%k1} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm2 {%k1} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
%3 = bitcast i32 %mask to <32 x i1>
@@ -2482,8 +4268,16 @@ define <32 x i8> @stack_fold_punpckhbw_m
}
define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
- ;CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm
- ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
%3 = bitcast i32 %mask to <32 x i1>
@@ -2492,8 +4286,18 @@ define <32 x i8> @stack_fold_punpckhbw_m
}
define <4 x i64> @stack_fold_shufi64x2_maskz(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
- ;CHECK-LABEL: stack_fold_shufi64x2_maskz
- ;CHECK: vshufi64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi64x2_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vshufi64x2 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
%3 = bitcast i8 %mask to <8 x i1>
@@ -2503,8 +4307,18 @@ define <4 x i64> @stack_fold_shufi64x2_m
}
define <8 x i32> @stack_fold_shufi32x4_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
- ;CHECK-LABEL: stack_fold_shufi32x4_maskz
- ;CHECK: vshufi32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[1-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+; CHECK-LABEL: stack_fold_shufi32x4_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; CHECK-NEXT: vshufi32x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload
+; CHECK-NEXT: # ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%3 = bitcast i8 %mask to <8 x i1>
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vnni.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vnni.ll?rev=369876&r1=369875&r2=369876&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vnni.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vnni.ll Sun Aug 25 13:48:14 2019
@@ -1,27 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vbmi,+avx512cd,+avx512vpopcntdq,+avx512vnni < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
define <16 x i32> @stack_fold_vpdpwssd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ;CHECK-LABEL: stack_fold_vpdpwssd:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_vpdpwssd_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ;CHECK-LABEL: stack_fold_vpdpwssd_commuted:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_vpdpwssd_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssd_mask:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i32>, <16 x i32>* %a0
%3 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %2, <16 x i32> %a1, <16 x i32> %a2)
@@ -31,8 +53,17 @@ define <16 x i32> @stack_fold_vpdpwssd_m
}
define <16 x i32> @stack_fold_vpdpwssd_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssd_mask_commuted:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i32>, <16 x i32>* %a0
%3 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %2, <16 x i32> %a2, <16 x i32> %a1)
@@ -42,8 +73,15 @@ define <16 x i32> @stack_fold_vpdpwssd_m
}
define <16 x i32> @stack_fold_vpdpwssd_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssd_maskz:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovw (%rdi), %k1
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
%3 = load i16, i16* %mask
@@ -53,8 +91,15 @@ define <16 x i32> @stack_fold_vpdpwssd_m
}
define <16 x i32> @stack_fold_vpdpwssd_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssd_maskz_commuted:
- ;CHECK: vpdpwssd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssd_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovw (%rdi), %k1
+; CHECK-NEXT: vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
%3 = load i16, i16* %mask
@@ -64,24 +109,45 @@ define <16 x i32> @stack_fold_vpdpwssd_m
}
define <16 x i32> @stack_fold_vpdpwssds(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ;CHECK-LABEL: stack_fold_vpdpwssds:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_vpdpwssds_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ;CHECK-LABEL: stack_fold_vpdpwssds_commuted:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
ret <16 x i32> %2
}
define <16 x i32> @stack_fold_vpdpwssds_mask(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssds_mask:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i32>, <16 x i32>* %a0
%3 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %2, <16 x i32> %a1, <16 x i32> %a2)
@@ -91,8 +157,17 @@ define <16 x i32> @stack_fold_vpdpwssds_
}
define <16 x i32> @stack_fold_vpdpwssds_mask_commuted(<16 x i32>* %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssds_mask_commuted:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds_mask_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <16 x i32>, <16 x i32>* %a0
%3 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %2, <16 x i32> %a2, <16 x i32> %a1)
@@ -102,8 +177,15 @@ define <16 x i32> @stack_fold_vpdpwssds_
}
define <16 x i32> @stack_fold_vpdpwssds_maskz(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssds_maskz:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds_maskz:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovw (%rdi), %k1
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2)
%3 = load i16, i16* %mask
@@ -113,8 +195,15 @@ define <16 x i32> @stack_fold_vpdpwssds_
}
define <16 x i32> @stack_fold_vpdpwssds_maskz_commuted(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16* %mask) {
- ;CHECK-LABEL: stack_fold_vpdpwssds_maskz_commuted:
- ;CHECK: vpdpwssds {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+; CHECK-LABEL: stack_fold_vpdpwssds_maskz_commuted:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: kmovw (%rdi), %k1
+; CHECK-NEXT: vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %a0, <16 x i32> %a2, <16 x i32> %a1)
%3 = load i16, i16* %mask
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll?rev=369876&r1=369875&r2=369876&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-sse42.ll Sun Aug 25 13:48:14 2019
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+aes,+pclmul < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,8 +10,14 @@ target triple = "x86_64-unknown-unknown"
; relevant registers and check that the reload is correctly folded into the instruction.
define <2 x i64> @stack_fold_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_aesdec
- ;CHECK: aesdec {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aesdec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aesdec {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -18,8 +25,14 @@ define <2 x i64> @stack_fold_aesdec(<2 x
declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @stack_fold_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_aesdeclast
- ;CHECK: aesdeclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aesdeclast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aesdeclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -27,8 +40,14 @@ define <2 x i64> @stack_fold_aesdeclast(
declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @stack_fold_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_aesenc
- ;CHECK: aesenc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aesenc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aesenc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -36,8 +55,14 @@ define <2 x i64> @stack_fold_aesenc(<2 x
declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @stack_fold_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_aesenclast
- ;CHECK: aesenclast {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aesenclast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aesenclast {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -45,8 +70,14 @@ define <2 x i64> @stack_fold_aesenclast(
declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @stack_fold_aesimc(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_aesimc
- ;CHECK: aesimc {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aesimc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aesimc {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
ret <2 x i64> %2
@@ -54,8 +85,14 @@ define <2 x i64> @stack_fold_aesimc(<2 x
declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
define <2 x i64> @stack_fold_aeskeygenassist(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_aeskeygenassist
- ;CHECK: aeskeygenassist $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_aeskeygenassist:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: aeskeygenassist $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
ret <2 x i64> %2
@@ -63,8 +100,45 @@ define <2 x i64> @stack_fold_aeskeygenas
declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
define i32 @stack_fold_crc32_32_8(i32 %a0, i8 %a1) {
- ;CHECK-LABEL: stack_fold_crc32_32_8
- ;CHECK: crc32b {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 1-byte Folded Reload
+; CHECK-LABEL: stack_fold_crc32_32_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: crc32b {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
ret i32 %2
@@ -72,8 +146,45 @@ define i32 @stack_fold_crc32_32_8(i32 %a
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
define i32 @stack_fold_crc32_32_16(i32 %a0, i16 %a1) {
- ;CHECK-LABEL: stack_fold_crc32_32_16
- ;CHECK: crc32w {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 2-byte Folded Reload
+; CHECK-LABEL: stack_fold_crc32_32_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: crc32w {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
ret i32 %2
@@ -81,8 +192,45 @@ define i32 @stack_fold_crc32_32_16(i32 %
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
define i32 @stack_fold_crc32_32_32(i32 %a0, i32 %a1) {
- ;CHECK-LABEL: stack_fold_crc32_32_32
- ;CHECK: crc32l {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_crc32_32_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: crc32l {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
ret i32 %2
@@ -90,8 +238,45 @@ define i32 @stack_fold_crc32_32_32(i32 %
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define i64 @stack_fold_crc32_64_64(i64 %a0, i64 %a1) {
- ;CHECK-LABEL: stack_fold_crc32_64_64
- ;CHECK: crc32q {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_crc32_64_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: crc32q {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
ret i64 %2
@@ -99,8 +284,47 @@ define i64 @stack_fold_crc32_64_64(i64 %
declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
define <4 x i32> @stack_fold_movd_load(i32 %a0) {
- ;CHECK-LABEL: stack_fold_movd_load
- ;CHECK: movd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_movd_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <4 x i32> zeroinitializer, i32 %a0, i32 0
; add forces execution domain
@@ -109,8 +333,45 @@ define <4 x i32> @stack_fold_movd_load(i
}
define i32 @stack_fold_movd_store(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_movd_store
- ;CHECK: movd {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+; CHECK-LABEL: stack_fold_movd_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
; add forces execution domain
%1 = add <4 x i32> %a0, %a1
%2 = extractelement <4 x i32> %1, i32 0
@@ -119,8 +380,17 @@ define i32 @stack_fold_movd_store(<4 x i
}
define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_movq_load
- ;CHECK: movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_movq_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
; add forces execution domain
@@ -129,8 +399,45 @@ define <2 x i64> @stack_fold_movq_load(<
}
define i64 @stack_fold_movq_store(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_movq_store
- ;CHECK: movq {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+; CHECK-LABEL: stack_fold_movq_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: paddq %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
; add forces execution domain
%1 = add <2 x i64> %a0, %a1
%2 = extractelement <2 x i64> %1, i32 0
@@ -139,8 +446,14 @@ define i64 @stack_fold_movq_store(<2 x i
}
define <8 x i16> @stack_fold_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_mpsadbw
- ;CHECK: mpsadbw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_mpsadbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: mpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
ret <8 x i16> %2
@@ -148,8 +461,14 @@ define <8 x i16> @stack_fold_mpsadbw(<16
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pabsb
- ;CHECK: pabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pabsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, zeroinitializer
%3 = sub <16 x i8> zeroinitializer, %a0
@@ -158,8 +477,14 @@ define <16 x i8> @stack_fold_pabsb(<16 x
}
define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pabsd
- ;CHECK: pabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <4 x i32> %a0, zeroinitializer
%3 = sub <4 x i32> zeroinitializer, %a0
@@ -168,8 +493,14 @@ define <4 x i32> @stack_fold_pabsd(<4 x
}
define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pabsw
- ;CHECK: pabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pabsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pabsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, zeroinitializer
%3 = sub <8 x i16> zeroinitializer, %a0
@@ -178,8 +509,14 @@ define <8 x i16> @stack_fold_pabsw(<8 x
}
define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packssdw
- ;CHECK: packssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packssdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
ret <8 x i16> %2
@@ -187,8 +524,14 @@ define <8 x i16> @stack_fold_packssdw(<4
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packsswb
- ;CHECK: packsswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packsswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: packsswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
ret <16 x i8> %2
@@ -196,8 +539,14 @@ define <16 x i8> @stack_fold_packsswb(<8
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_packusdw
- ;CHECK: packusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packusdw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: packusdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
ret <8 x i16> %2
@@ -205,8 +554,14 @@ define <8 x i16> @stack_fold_packusdw(<4
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_packuswb
- ;CHECK: packuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_packuswb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
ret <16 x i8> %2
@@ -214,32 +569,56 @@ define <16 x i8> @stack_fold_packuswb(<8
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddb
- ;CHECK: paddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = add <16 x i8> %a0, %a1
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_paddd
- ;CHECK: paddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = add <4 x i32> %a0, %a1
ret <4 x i32> %2
}
define <2 x i64> @stack_fold_paddq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_paddq
- ;CHECK: paddq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = add <2 x i64> %a0, %a1
ret <2 x i64> %2
}
define <16 x i8> @stack_fold_paddsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddsb
- ;CHECK: paddsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -247,8 +626,14 @@ define <16 x i8> @stack_fold_paddsb(<16
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_paddsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddsw
- ;CHECK: paddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -256,8 +641,14 @@ define <8 x i16> @stack_fold_paddsw(<8 x
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_paddusb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_paddusb
- ;CHECK: paddusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -265,8 +656,14 @@ define <16 x i8> @stack_fold_paddusb(<16
declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_paddusw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddusw
- ;CHECK: paddusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -274,24 +671,45 @@ define <8 x i16> @stack_fold_paddusw(<8
declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_paddw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_paddw
- ;CHECK: paddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_paddw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: paddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = add <8 x i16> %a0, %a1
ret <8 x i16> %2
}
define <16 x i8> @stack_fold_palignr(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_palignr
- ;CHECK: palignr $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_palignr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: palignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a1, <16 x i8> %a0, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
ret <16 x i8> %2
}
define <16 x i8> @stack_fold_pand(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pand
- ;CHECK: pand {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pand:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pand {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = and <16 x i8> %a0, %a1
; add forces execution domain
@@ -300,8 +718,16 @@ define <16 x i8> @stack_fold_pand(<16 x
}
define <16 x i8> @stack_fold_pandn(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pandn
- ;CHECK: pandn {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pandn:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = xor <16 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = and <16 x i8> %2, %a1
@@ -311,8 +737,14 @@ define <16 x i8> @stack_fold_pandn(<16 x
}
define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pavgb
- ;CHECK: pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pavgb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = zext <16 x i8> %a0 to <16 x i16>
%3 = zext <16 x i8> %a1 to <16 x i16>
@@ -324,8 +756,14 @@ define <16 x i8> @stack_fold_pavgb(<16 x
}
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pavgw
- ;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pavgw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pavgw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = zext <8 x i16> %a0 to <8 x i32>
%3 = zext <8 x i16> %a1 to <8 x i32>
@@ -337,8 +775,16 @@ define <8 x i16> @stack_fold_pavgw(<8 x
}
define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
- ;CHECK-LABEL: stack_fold_pblendvb
- ;CHECK: pblendvb %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pblendvb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pblendvb %xmm0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
ret <16 x i8> %2
@@ -346,16 +792,29 @@ define <16 x i8> @stack_fold_pblendvb(<1
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pblendw
- ;CHECK: pblendw $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pblendw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1,2],xmm0[3,4,5,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %2
}
define <2 x i64> @stack_fold_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pclmulqdq
- ;CHECK: pclmulqdq $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pclmulqdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pclmulqdq $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
ret <2 x i64> %2
@@ -363,8 +822,14 @@ define <2 x i64> @stack_fold_pclmulqdq(<
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqb
- ;CHECK: pcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>
@@ -372,8 +837,14 @@ define <16 x i8> @stack_fold_pcmpeqb(<16
}
define <4 x i32> @stack_fold_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqd
- ;CHECK: pcmpeqd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <4 x i32> %a0, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -381,8 +852,14 @@ define <4 x i32> @stack_fold_pcmpeqd(<4
}
define <2 x i64> @stack_fold_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqq
- ;CHECK: pcmpeqq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <2 x i64> %a0, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -390,8 +867,14 @@ define <2 x i64> @stack_fold_pcmpeqq(<2
}
define <8 x i16> @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpeqw
- ;CHECK: pcmpeqw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpeqw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp eq <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>
@@ -399,8 +882,17 @@ define <8 x i16> @stack_fold_pcmpeqw(<8
}
define i32 @stack_fold_pcmpestri(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpestri
- ;CHECK: pcmpestri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpestri:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
%2 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
ret i32 %2
@@ -408,8 +900,16 @@ define i32 @stack_fold_pcmpestri(<16 x i
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpestrm
- ;CHECK: pcmpestrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpestrm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl $7, %eax
+; CHECK-NEXT: movl $7, %edx
+; CHECK-NEXT: pcmpestrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{rax},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
ret <16 x i8> %2
@@ -417,8 +917,14 @@ define <16 x i8> @stack_fold_pcmpestrm(<
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpgtb
- ;CHECK: pcmpgtb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpgtb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, %a1
%3 = sext <16 x i1> %2 to <16 x i8>
@@ -426,8 +932,14 @@ define <16 x i8> @stack_fold_pcmpgtb(<16
}
define <4 x i32> @stack_fold_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpgtd
- ;CHECK: pcmpgtd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpgtd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <4 x i32> %a0, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -435,8 +947,14 @@ define <4 x i32> @stack_fold_pcmpgtd(<4
}
define <2 x i64> @stack_fold_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpgtq
- ;CHECK: pcmpgtq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpgtq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <2 x i64> %a0, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -444,8 +962,14 @@ define <2 x i64> @stack_fold_pcmpgtq(<2
}
define <8 x i16> @stack_fold_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpgtw
- ;CHECK: pcmpgtw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpgtw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1
%3 = sext <8 x i1> %2 to <8 x i16>
@@ -453,8 +977,15 @@ define <8 x i16> @stack_fold_pcmpgtw(<8
}
define i32 @stack_fold_pcmpistri(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpistri
- ;CHECK: pcmpistri $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpistri:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpistri $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
ret i32 %2
@@ -462,8 +993,14 @@ define i32 @stack_fold_pcmpistri(<16 x i
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
define <16 x i8> @stack_fold_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pcmpistrm
- ;CHECK: pcmpistrm $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pcmpistrm:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pcmpistrm $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
ret <16 x i8> %2
@@ -475,10 +1012,47 @@ declare <16 x i8> @llvm.x86.sse42.pcmpis
; We can't naively fold pextrw as it only writes to a 16-bit memory location
; even though it can store to a 32-bit register.
define i16 @stack_fold_pextrw(<8 x i16> %a0) {
-; CHECK-LABEL: stack_fold_pextrw
-; CHECK: pextrw $1, {{%xmm[0-9][0-9]*}}, %[[GPR32:(e[a-z]+|r[0-9]+d)]]
-; CHECK: movl %[[GPR32]], {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Spill
-; CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+; CHECK-LABEL: stack_fold_pextrw:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-NEXT: addl $2, %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
entry:
; add forces execution domain
%add = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
@@ -488,9 +1062,45 @@ entry:
}
define i32 @stack_fold_pextrd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pextrd
- ;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
- ;CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+; CHECK-LABEL: stack_fold_pextrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: paddd %xmm1, %xmm0
+; CHECK-NEXT: pextrd $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
; add forces execution domain
%1 = add <4 x i32> %a0, %a1
%2 = extractelement <4 x i32> %1, i32 1
@@ -499,17 +1109,58 @@ define i32 @stack_fold_pextrd(<4 x i32>
}
define i64 @stack_fold_pextrq(<2 x i64> %a0) {
- ;CHECK-LABEL: stack_fold_pextrq
- ;CHECK: pextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
- ;CHECK: movq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+; CHECK-LABEL: stack_fold_pextrq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: pextrq $1, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = extractelement <2 x i64> %a0, i32 1
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
ret i64 %1
}
define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_phaddd
- ;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phaddd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phaddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -517,8 +1168,14 @@ define <4 x i32> @stack_fold_phaddd(<4 x
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_phaddsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_phaddsw
- ;CHECK: phaddsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phaddsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phaddsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -526,8 +1183,14 @@ define <8 x i16> @stack_fold_phaddsw(<8
declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_phaddw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_phaddw
- ;CHECK: phaddw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phaddw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phaddw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -535,8 +1198,14 @@ define <8 x i16> @stack_fold_phaddw(<8 x
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_phminposuw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_phminposuw
- ;CHECK: phminposuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phminposuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phminposuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0)
ret <8 x i16> %2
@@ -544,8 +1213,14 @@ define <8 x i16> @stack_fold_phminposuw(
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_phsubd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_phsubd
- ;CHECK: phsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phsubd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phsubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -553,8 +1228,14 @@ define <4 x i32> @stack_fold_phsubd(<4 x
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_phsubsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_phsubsw
- ;CHECK: phsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phsubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phsubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -562,8 +1243,14 @@ define <8 x i16> @stack_fold_phsubsw(<8
declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_phsubw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_phsubw
- ;CHECK: phsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_phsubw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: phsubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -571,40 +1258,190 @@ define <8 x i16> @stack_fold_phsubw(<8 x
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrb
- ;CHECK: pinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pinsrb $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrd
- ;CHECK: pinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pinsrd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
ret <4 x i32> %2
}
define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrq
- ;CHECK: pinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pinsrq $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
ret <2 x i64> %2
}
define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
- ;CHECK-LABEL: stack_fold_pinsrw
- ;CHECK: pinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+; CHECK-LABEL: stack_fold_pinsrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pinsrw $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
ret <8 x i16> %2
}
define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddubsw
- ;CHECK: pmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
ret <8 x i16> %2
@@ -612,8 +1449,14 @@ define <8 x i16> @stack_fold_pmaddubsw(<
declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaddwd
- ;CHECK: pmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaddwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
ret <4 x i32> %2
@@ -621,8 +1464,14 @@ define <4 x i32> @stack_fold_pmaddwd(<8
declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsb
- ;CHECK: pmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -630,8 +1479,14 @@ define <16 x i8> @stack_fold_pmaxsb(<16
}
define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsd
- ;CHECK: pmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -639,8 +1494,14 @@ define <4 x i32> @stack_fold_pmaxsd(<4 x
}
define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxsw
- ;CHECK: pmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp sgt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -648,8 +1509,14 @@ define <8 x i16> @stack_fold_pmaxsw(<8 x
}
define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxub
- ;CHECK: pmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -657,8 +1524,14 @@ define <16 x i8> @stack_fold_pmaxub(<16
}
define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxud
- ;CHECK: pmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -666,8 +1539,14 @@ define <4 x i32> @stack_fold_pmaxud(<4 x
}
define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmaxuw
- ;CHECK: pmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmaxuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ugt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -675,8 +1554,14 @@ define <8 x i16> @stack_fold_pmaxuw(<8 x
}
define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminsb
- ;CHECK: pminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -684,8 +1569,14 @@ define <16 x i8> @stack_fold_pminsb(<16
}
define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminsd
- ;CHECK: pminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -693,8 +1584,14 @@ define <4 x i32> @stack_fold_pminsd(<4 x
}
define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminsw
- ;CHECK: pminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp slt <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -702,8 +1599,14 @@ define <8 x i16> @stack_fold_pminsw(<8 x
}
define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pminub
- ;CHECK: pminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminub {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <16 x i8> %a0, %a1
%3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %a1
@@ -711,8 +1614,14 @@ define <16 x i8> @stack_fold_pminub(<16
}
define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pminud
- ;CHECK: pminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminud:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminud {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <4 x i32> %a0, %a1
%3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %a1
@@ -720,8 +1629,14 @@ define <4 x i32> @stack_fold_pminud(<4 x
}
define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pminuw
- ;CHECK: pminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pminuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pminuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = icmp ult <8 x i16> %a0, %a1
%3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %a1
@@ -729,8 +1644,14 @@ define <8 x i16> @stack_fold_pminuw(<8 x
}
define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbd
- ;CHECK: pmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i8> %2 to <4 x i32>
@@ -738,8 +1659,14 @@ define <4 x i32> @stack_fold_pmovsxbd(<1
}
define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbq
- ;CHECK: pmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i8> %2 to <2 x i64>
@@ -747,8 +1674,14 @@ define <2 x i64> @stack_fold_pmovsxbq(<1
}
define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxbw
- ;CHECK: pmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = sext <8 x i8> %2 to <8 x i16>
@@ -756,8 +1689,14 @@ define <8 x i16> @stack_fold_pmovsxbw(<1
}
define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxdq
- ;CHECK: pmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i32> %2 to <2 x i64>
@@ -765,8 +1704,14 @@ define <2 x i64> @stack_fold_pmovsxdq(<4
}
define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwd
- ;CHECK: pmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = sext <4 x i16> %2 to <4 x i32>
@@ -774,8 +1719,14 @@ define <4 x i32> @stack_fold_pmovsxwd(<8
}
define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovsxwq
- ;CHECK: pmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovsxwq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%3 = sext <2 x i16> %2 to <2 x i64>
@@ -783,8 +1734,15 @@ define <2 x i64> @stack_fold_pmovsxwq(<8
}
define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbd
- ;CHECK: pmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
%3 = bitcast <16 x i8> %2 to <4 x i32>
@@ -792,8 +1750,15 @@ define <4 x i32> @stack_fold_pmovzxbd(<1
}
define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbq
- ;CHECK: pmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
%3 = bitcast <16 x i8> %2 to <2 x i64>
@@ -801,8 +1766,15 @@ define <2 x i64> @stack_fold_pmovzxbq(<1
}
define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxbw
- ;CHECK: pmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%3 = bitcast <16 x i8> %2 to <8 x i16>
@@ -810,8 +1782,15 @@ define <8 x i16> @stack_fold_pmovzxbw(<1
}
define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxdq
- ;CHECK: pmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%3 = bitcast <4 x i32> %2 to <2 x i64>
@@ -819,8 +1798,15 @@ define <2 x i64> @stack_fold_pmovzxdq(<4
}
define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwd
- ;CHECK: pmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%3 = bitcast <8 x i16> %2 to <4 x i32>
@@ -828,8 +1814,15 @@ define <4 x i32> @stack_fold_pmovzxwd(<8
}
define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pmovzxwq
- ;CHECK: pmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmovzxwq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
%3 = bitcast <8 x i16> %2 to <2 x i64>
@@ -837,8 +1830,14 @@ define <2 x i64> @stack_fold_pmovzxwq(<8
}
define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuldq
- ;CHECK: pmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmuldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <4 x i32> %a0 to <2 x i64>
%3 = bitcast <4 x i32> %a1 to <2 x i64>
@@ -851,8 +1850,14 @@ define <2 x i64> @stack_fold_pmuldq(<4 x
}
define <8 x i16> @stack_fold_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmulhrsw
- ;CHECK: pmulhrsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulhrsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -860,8 +1865,14 @@ define <8 x i16> @stack_fold_pmulhrsw(<8
declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_pmulhuw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmulhuw
- ;CHECK: pmulhuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulhuw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -869,8 +1880,14 @@ define <8 x i16> @stack_fold_pmulhuw(<8
declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_pmulhw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmulhw
- ;CHECK: pmulhw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulhw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmulhw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -878,24 +1895,42 @@ define <8 x i16> @stack_fold_pmulhw(<8 x
declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pmulld(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmulld
- ;CHECK: pmulld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmulld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = mul <4 x i32> %a0, %a1
ret <4 x i32> %2
}
define <8 x i16> @stack_fold_pmullw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_pmullw
- ;CHECK: pmullw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmullw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmullw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = mul <8 x i16> %a0, %a1
ret <8 x i16> %2
}
define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pmuludq
- ;CHECK: pmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pmuludq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pmuludq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = bitcast <4 x i32> %a0 to <2 x i64>
%3 = bitcast <4 x i32> %a1 to <2 x i64>
@@ -906,8 +1941,16 @@ define <2 x i64> @stack_fold_pmuludq(<4
}
define <16 x i8> @stack_fold_por(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_por
- ;CHECK: por {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_por:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = or <16 x i8> %a0, %a1
; add forces execution domain
@@ -916,8 +1959,14 @@ define <16 x i8> @stack_fold_por(<16 x i
}
define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psadbw
- ;CHECK: psadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psadbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
ret <2 x i64> %2
@@ -925,8 +1974,14 @@ define <2 x i64> @stack_fold_psadbw(<16
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pshufb
- ;CHECK: pshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pshufb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -934,32 +1989,59 @@ define <16 x i8> @stack_fold_pshufb(<16
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
- ;CHECK-LABEL: stack_fold_pshufd
- ;CHECK: pshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,2,1,0]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i32> %2
}
define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshufhw
- ;CHECK: pshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshufhw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pshufhw $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1,2,3,7,6,4,4]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
ret <8 x i16> %2
}
define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
- ;CHECK-LABEL: stack_fold_pshuflw
- ;CHECK: pshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pshuflw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[3,2,1,0,4,5,6,7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %2
}
define <16 x i8> @stack_fold_psignb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psignb
- ;CHECK: psignb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psignb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psignb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -967,8 +2049,14 @@ define <16 x i8> @stack_fold_psignb(<16
declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @stack_fold_psignd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psignd
- ;CHECK: psignd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psignd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psignd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -976,8 +2064,14 @@ define <4 x i32> @stack_fold_psignd(<4 x
declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_psignw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psignw
- ;CHECK: psignw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psignw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psignw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -985,8 +2079,14 @@ define <8 x i16> @stack_fold_psignw(<8 x
declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_pslld
- ;CHECK: pslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pslld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pslld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -994,8 +2094,14 @@ define <4 x i32> @stack_fold_pslld(<4 x
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psllq
- ;CHECK: psllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psllq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -1003,8 +2109,14 @@ define <2 x i64> @stack_fold_psllq(<2 x
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psllw
- ;CHECK: psllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psllw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psllw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -1012,8 +2124,14 @@ define <8 x i16> @stack_fold_psllw(<8 x
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrad
- ;CHECK: psrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrad:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psrad {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -1021,8 +2139,14 @@ define <4 x i32> @stack_fold_psrad(<4 x
declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psraw
- ;CHECK: psraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psraw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psraw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -1030,8 +2154,14 @@ define <8 x i16> @stack_fold_psraw(<8 x
declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psrld
- ;CHECK: psrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrld:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psrld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
ret <4 x i32> %2
@@ -1039,8 +2169,14 @@ define <4 x i32> @stack_fold_psrld(<4 x
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psrlq
- ;CHECK: psrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psrlq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %2
@@ -1048,8 +2184,14 @@ define <2 x i64> @stack_fold_psrlq(<2 x
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psrlw
- ;CHECK: psrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psrlw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psrlw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -1057,32 +2199,56 @@ define <8 x i16> @stack_fold_psrlw(<8 x
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubb
- ;CHECK: psubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <16 x i8> %a0, %a1
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_psubd
- ;CHECK: psubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <4 x i32> %a0, %a1
ret <4 x i32> %2
}
define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_psubq
- ;CHECK: psubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <2 x i64> %a0, %a1
ret <2 x i64> %2
}
define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubsb
- ;CHECK: psubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubsb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -1090,8 +2256,14 @@ define <16 x i8> @stack_fold_psubsb(<16
declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubsw
- ;CHECK: psubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubsw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -1099,8 +2271,14 @@ define <8 x i16> @stack_fold_psubsw(<8 x
declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_psubusb
- ;CHECK: psubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubusb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
@@ -1108,8 +2286,14 @@ define <16 x i8> @stack_fold_psubusb(<16
declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubusw
- ;CHECK: psubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubusw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubusw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
@@ -1117,16 +2301,30 @@ define <8 x i16> @stack_fold_psubusw(<8
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_psubw
- ;CHECK: psubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_psubw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: psubw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = sub <8 x i16> %a0, %a1
ret <8 x i16> %2
}
define i32 @stack_fold_ptest(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_ptest
- ;CHECK: ptest {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_ptest:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ptest {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
ret i32 %2
@@ -1134,16 +2332,32 @@ define i32 @stack_fold_ptest(<2 x i64> %
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhbw
- ;CHECK: punpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_punpckhdq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhdq
- ;CHECK: punpckhdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
; add forces execution domain
@@ -1152,8 +2366,17 @@ define <4 x i32> @stack_fold_punpckhdq(<
}
define <2 x i64> @stack_fold_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhqdq
- ;CHECK: punpckhqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhqdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
; add forces execution domain
@@ -1162,24 +2385,47 @@ define <2 x i64> @stack_fold_punpckhqdq(
}
define <8 x i16> @stack_fold_punpckhwd(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_punpckhwd
- ;CHECK: punpckhwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckhwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
ret <8 x i16> %2
}
define <16 x i8> @stack_fold_punpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_punpcklbw
- ;CHECK: punpcklbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpcklbw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
ret <16 x i8> %2
}
define <4 x i32> @stack_fold_punpckldq(<4 x i32> %a0, <4 x i32> %a1) {
- ;CHECK-LABEL: stack_fold_punpckldq
- ;CHECK: punpckldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpckldq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
; add forces execution domain
@@ -1188,8 +2434,17 @@ define <4 x i32> @stack_fold_punpckldq(<
}
define <2 x i64> @stack_fold_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1) {
- ;CHECK-LABEL: stack_fold_punpcklqdq
- ;CHECK: punpcklqdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpcklqdq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
; add forces execution domain
@@ -1198,16 +2453,31 @@ define <2 x i64> @stack_fold_punpcklqdq(
}
define <8 x i16> @stack_fold_punpcklwd(<8 x i16> %a0, <8 x i16> %a1) {
- ;CHECK-LABEL: stack_fold_punpcklwd
- ;CHECK: punpcklwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_punpcklwd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i16> %2
}
define <16 x i8> @stack_fold_pxor(<16 x i8> %a0, <16 x i8> %a1) {
- ;CHECK-LABEL: stack_fold_pxor
- ;CHECK: pxor {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+; CHECK-LABEL: stack_fold_pxor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: pxor {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = xor <16 x i8> %a0, %a1
; add forces execution domain
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-lwp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-lwp.ll?rev=369876&r1=369875&r2=369876&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-lwp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-lwp.ll Sun Aug 25 13:48:14 2019
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+lwp < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@@ -9,9 +10,48 @@ target triple = "x86_64-unknown-unknown"
; relevant registers and check that the reload is correctly folded into the instruction.
define i8 @stack_fold_lwpins_u32(i32 %a0, i32 %a1) {
-; CHECK-LABEL: stack_fold_lwpins_u32
+; CHECK-LABEL: stack_fold_lwpins_u32:
; CHECK: # %bb.0:
-; CHECK: lwpins $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: lwpins $2814, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: # imm = 0xAFE
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 2814)
ret i8 %2
@@ -19,9 +59,48 @@ define i8 @stack_fold_lwpins_u32(i32 %a0
declare i8 @llvm.x86.lwpins32(i32, i32, i32)
define i8 @stack_fold_lwpins_u64(i64 %a0, i32 %a1) {
-; CHECK-LABEL: stack_fold_lwpins_u64
+; CHECK-LABEL: stack_fold_lwpins_u64:
; CHECK: # %bb.0:
-; CHECK: lwpins $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: lwpins $2814, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
+; CHECK-NEXT: # imm = 0xAFE
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2814)
ret i8 %2
@@ -29,9 +108,47 @@ define i8 @stack_fold_lwpins_u64(i64 %a0
declare i8 @llvm.x86.lwpins64(i64, i32, i32)
define void @stack_fold_lwpval_u32(i32 %a0, i32 %a1) {
-; CHECK-LABEL: stack_fold_lwpval_u32
+; CHECK-LABEL: stack_fold_lwpval_u32:
; CHECK: # %bb.0:
-; CHECK: lwpval $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT: lwpval $2814, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-NEXT: # imm = 0xAFE
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 2814)
ret void
@@ -39,9 +156,47 @@ define void @stack_fold_lwpval_u32(i32 %
declare void @llvm.x86.lwpval32(i32, i32, i32)
define void @stack_fold_lwpval_u64(i64 %a0, i32 %a1) {
-; CHECK-LABEL: stack_fold_lwpval_u64
+; CHECK-LABEL: stack_fold_lwpval_u64:
; CHECK: # %bb.0:
-; CHECK: lwpval $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; CHECK-NEXT: lwpval $2814, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 4-byte Folded Reload
+; CHECK-NEXT: # imm = 0xAFE
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 2814)
ret void
More information about the llvm-commits
mailing list