[llvm] r294858 - [AVX-512] Add VPMINS/MINU/MAXS/MAXU instructions to load folding tables.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 11 09:35:28 PST 2017
Author: ctopper
Date: Sat Feb 11 11:35:28 2017
New Revision: 294858
URL: http://llvm.org/viewvc/llvm-project?rev=294858&view=rev
Log:
[AVX-512] Add VPMINS/MINU/MAXS/MAXU instructions to load folding tables.
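For readers unfamiliar with these tables: each row pairs a register-register opcode with its register-memory counterpart plus a flags field (0 for all of the entries added here), so that when one source operand of a packed min/max lives in a stack slot the reload can be folded directly into the instruction instead of being issued as a separate load. The test additions below exercise exactly that: the inline asm clobbers xmm2-xmm31 to force the vector arguments to be spilled, and the CHECK lines verify that the reload is folded into the vpmin/vpmax memory form. As a rough illustration only (this is not LLVM's actual folding API; the struct and function names below are hypothetical), the lookup the tables enable boils down to:

  // Illustrative sketch only -- not LLVM's real data structures or API.
  #include <cstdint>
  #include <unordered_map>

  struct FoldEntry {
    uint16_t RegOpc;  // e.g. X86::VPMAXSBZrr (register-register form)
    uint16_t MemOpc;  // e.g. X86::VPMAXSBZrm (register-memory form)
    uint16_t Flags;   // extra constraints, e.g. required alignment (0 = none)
  };

  // Hypothetical helper: map a reg-reg opcode to its foldable reg-mem
  // opcode, or return 0 when no table entry exists.
  static uint16_t lookupFoldedOpcode(
      const std::unordered_map<uint16_t, FoldEntry> &Table, uint16_t RegOpc) {
    auto It = Table.find(RegOpc);
    return It == Table.end() ? 0 : It->second.MemOpc;
  }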
Modified:
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=294858&r1=294857&r2=294858&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Sat Feb 11 11:35:28 2017
@@ -1929,14 +1929,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
{ X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
{ X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
@@ -2124,6 +2132,38 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
{ X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
{ X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
{ X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
{ X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
@@ -2531,6 +2571,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
{ X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
{ X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
{ X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
@@ -2638,6 +2694,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
{ X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
{ X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
{ X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
@@ -2735,6 +2807,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
{ X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
{ X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
{ X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
@@ -2996,6 +3084,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
{ X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
{ X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
{ X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
@@ -3116,6 +3220,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
{ X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
{ X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
{ X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
@@ -3227,6 +3347,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
{ X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
{ X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
{ X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll?rev=294858&r1=294857&r2=294858&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll Sat Feb 11 11:35:28 2017
@@ -726,6 +726,328 @@ define <8 x i32> @stack_fold_pmaddwd_ymm
ret <8 x i32> %4
}
+define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsb
+ ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsb_ymm
+ ;CHECK: vpmaxsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsd
+ ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsd_ymm
+ ;CHECK: vpmaxsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsq
+ ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsq_ymm
+ ;CHECK: vpmaxsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsw
+ ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxsw_ymm
+ ;CHECK: vpmaxsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxub
+ ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxub_ymm
+ ;CHECK: vpmaxub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxud
+ ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxud_ymm
+ ;CHECK: vpmaxud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxuq
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_pmaxuq_mask
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <2 x i64>, <2 x i64>* %passthru
+ %3 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %2, i8 %mask)
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_pmaxuq_maskz
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxuq_ymm
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_pmaxuq_ymm_mask
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = load <4 x i64>, <4 x i64>* %passthru
+ %3 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %2, i8 %mask)
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz
+ ;CHECK: vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
+ ret <4 x i64> %2
+}
+
+define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxuw
+ ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pmaxuw_ymm
+ ;CHECK: vpmaxuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsb
+ ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsb_ymm
+ ;CHECK: vpminsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsd
+ ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsd_ymm
+ ;CHECK: vpminsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsq
+ ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsq_ymm
+ ;CHECK: vpminsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsw
+ ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pminsw_ymm
+ ;CHECK: vpminsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pminub
+ ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_pminub_ymm
+ ;CHECK: vpminub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pminud
+ ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+ ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_pminud_ymm
+ ;CHECK: vpminud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pminuq
+ ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+ ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_pminuq_ymm
+ ;CHECK: vpminuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+ ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pminuw
+ ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+ ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_pminuw_ymm
+ ;CHECK: vpminuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+ ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
;CHECK-LABEL: stack_fold_vpmovdw
;CHECK: vpmovdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill