[llvm] r288591 - [AVX-512] Add many of the VPERM instructions to the load folding table. Move VPERMPDZri to the correct table.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Dec 3 11:37:39 PST 2016
Author: ctopper
Date: Sat Dec 3 13:37:39 2016
New Revision: 288591
URL: http://llvm.org/viewvc/llvm-project?rev=288591&view=rev
Log:
[AVX-512] Add many of the VPERM instructions to the load folding table. Move VPERMPDZri to the correct table.
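
For context: the tables touched below pair a register-form opcode with its memory-form twin so that a reload from a spill slot can be folded into the instruction itself. What follows is a minimal standalone sketch of that mapping, assuming a simplified { RegOp, MemOp, Flags } entry layout; the type and function names are illustrative, not LLVM's actual X86InstrInfo API. Once the VPERM pairs are registered, spill code can fold a 64-byte (or 32-byte for the VL forms) reload straight into vpermd/vpermpd/vpermq and friends, which is what the new stack-folding tests check.

// Illustrative only: a standalone sketch of the register-to-memory folding
// lookup that tables like the ones in this patch feed. The entry layout
// mirrors the { RegOp, MemOp, Flags } triples in the diff below; the type
// and function names here are hypothetical, not LLVM's X86InstrInfo API.
#include <optional>
#include <unordered_map>

namespace sketch {

struct MemoryFoldEntry {
  unsigned RegOp;  // register-form opcode, e.g. VPERMPDZri
  unsigned MemOp;  // memory-form opcode, e.g. VPERMPDZmi
  unsigned Flags;  // TB_* modifiers; 0 for the VPERM entries added here
};

class FoldTable {
  std::unordered_map<unsigned, MemoryFoldEntry> RegToMem;

public:
  void add(const MemoryFoldEntry &E) { RegToMem.emplace(E.RegOp, E); }

  // If RegOpcode has a memory form registered, return it so a reload from a
  // spill slot can be folded into the instruction instead of staying a
  // separate load.
  std::optional<unsigned> foldLoad(unsigned RegOpcode) const {
    auto It = RegToMem.find(RegOpcode);
    if (It == RegToMem.end())
      return std::nullopt;
    return It->second.MemOp;
  }
};

} // namespace sketch
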
Modified:
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=288591&r1=288590&r2=288591&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Sat Dec 3 13:37:39 2016
@@ -883,6 +883,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMQZri, X86::VPERMQZmi, 0 },
{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
@@ -914,6 +916,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
+ { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
@@ -1868,8 +1872,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
{ X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
{ X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
- { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMBZrr, X86::VPERMBZrm, 0 },
+ { X86::VPERMDZrr, X86::VPERMDZrm, 0 },
+ { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPERMQZrr, X86::VPERMQZrm, 0 },
+ { X86::VPERMWZrr, X86::VPERMWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
@@ -2031,6 +2039,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
{ X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
{ X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
+ { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
+ { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
+ { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
+ { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
+ { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
+ { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
{ X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
@@ -2095,6 +2111,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
// AVX-512 masked foldable instructions
+ { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
+ { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
@@ -2112,6 +2130,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
+ { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
@@ -2322,6 +2342,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
+ { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
+ { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
+ { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
+ { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
+ { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
@@ -2391,6 +2417,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
+ { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
+ { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
+ { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
+ { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
+ { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
@@ -2456,6 +2488,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
+ { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
@@ -2489,6 +2523,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
// AVX-512 masked foldable instructions
+ { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
+ { X86::VPERMQZrik, X86::VPERMQZmik, 0 },
{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
@@ -2506,6 +2542,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
+ { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
@@ -2605,6 +2643,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
+ { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
+ { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
+ { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
+ { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
+ { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
@@ -2677,6 +2721,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
+ { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
+ { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
+ { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
+ { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
+ { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
@@ -2746,6 +2796,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
+ { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll?rev=288591&r1=288590&r2=288591&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll Sat Dec 3 13:37:39 2016
@@ -870,7 +870,190 @@ define <16 x i32> @stack_fold_pmaddwd_zm
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
%3 = bitcast i16 %mask to <16 x i1>
- ; load needed to keep the operation from being scheduled about the asm block
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
ret <16 x i32> %4
}
+
+define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_permd
+ ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
+ ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x double> @stack_fold_permpd(<8 x double> %a0) {
+ ;CHECK-LABEL: stack_fold_permpd
+ ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ ; fadd forces execution domain
+ %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+ ret <8 x double> %3
+}
+
+define <8 x double> @stack_fold_permpd_mask(<8 x double>* %passthru, <8 x double> %a0, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_permpd_mask
+ ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ %3 = bitcast i8 %mask to <8 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <8 x double>, <8 x double>* %passthru
+ %5 = select <8 x i1> %3, <8 x double> %2, <8 x double> %4
+ ; fadd forces execution domain
+ %6 = fadd <8 x double> %5, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+ ret <8 x double> %6
+}
+
+define <8 x double> @stack_fold_permpd_maskz(<8 x double> %a0, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_permpd_maskz
+ ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %4
+}
+
+define <8 x double> @stack_fold_permpdvar(<8 x i64> %a0, <8 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_permpdvar
+ ;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %a1, <8 x i64> %a0, <8 x double> undef, i8 -1)
+ ; fadd forces execution domain
+ %3 = fadd <8 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0, double 0x0>
+ ret <8 x double> %3
+}
+declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) nounwind readonly
+
+define <16 x float> @stack_fold_permps(<16 x i32> %a0, <16 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_permps
+ ;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %a1, <16 x i32> %a0, <16 x float> undef, i16 -1)
+ ret <16 x float> %2
+}
+declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) nounwind readonly
+
+define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_permq
+ ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ ; add forces execution domain
+ %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_permq_mask
+ ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ %3 = bitcast i8 %mask to <8 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <8 x i64>, <8 x i64>* %passthru
+ %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+ ; add forces execution domain
+ %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %6
+}
+
+define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_permq_maskz
+ ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+ %3 = bitcast i8 %mask to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_permqvar
+ ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+ ; add forces execution domain
+ %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %3
+}
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readonly
+
+define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+ ;CHECK-LABEL: stack_fold_permqvar_mask
+ ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+ %3 = bitcast i8 %mask to <8 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <8 x i64>, <8 x i64>* %passthru
+ %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+ ; add forces execution domain
+ %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %6
+}
+
+define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_permwvar
+ ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+ ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readonly
+
+define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_permwvar_mask
+ ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <32 x i16>, <32 x i16>* %passthru
+ %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
+ ret <32 x i16> %5
+}
+
+define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_permwvar_maskz
+ ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+ ret <32 x i16> %4
+}
+
+define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_permbvar
+ ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+ ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readonly
+
+define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+ ;CHECK-LABEL: stack_fold_permbvar_mask
+ ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+ %3 = bitcast i64 %mask to <64 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <64 x i8>, <64 x i8>* %passthru
+ %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
+ ret <64 x i8> %5
+}
+
+define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+ ;CHECK-LABEL: stack_fold_permbvar_maskz
+ ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+ %3 = bitcast i64 %mask to <64 x i1>
+ %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
+ ret <64 x i8> %4
+}
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll?rev=288591&r1=288590&r2=288591&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll Sat Dec 3 13:37:39 2016
@@ -1340,3 +1340,85 @@ define <8 x i32> @stack_fold_pmaddwd_ymm
%4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
ret <8 x i32> %4
}
+
+define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
+ ;CHECK-LABEL: stack_fold_permd
+ ;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
+ ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
+ ;CHECK-LABEL: stack_fold_permpd
+ ;CHECK: vpermpd $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+ ; fadd forces execution domain
+ %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+ ret <4 x double> %3
+}
+
+define <4 x double> @stack_fold_permpdvar(<4 x i64> %a0, <4 x double> %a1) {
+ ;CHECK-LABEL: stack_fold_permpdvar
+ ;CHECK: vpermpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %a1, <4 x i64> %a0, <4 x double> undef, i8 -1)
+ ; fadd forces execution domain
+ %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
+ ret <4 x double> %3
+}
+declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_permps
+ ;CHECK: vpermps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
+ ret <8 x float> %2
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
+ ;CHECK-LABEL: stack_fold_permq
+ ;CHECK: vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+ ; add forces execution domain
+ %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
+ ;CHECK-LABEL: stack_fold_permqvar
+ ;CHECK: vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0, <4 x i64> undef, i8 -1)
+ ; add forces execution domain
+ %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
+ ;CHECK-LABEL: stack_fold_permwvar
+ ;CHECK: vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0, <16 x i16> undef, i16 -1)
+ ; add forces execution domain
+ %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly
+
+define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
+ ;CHECK-LABEL: stack_fold_permbvar
+ ;CHECK: vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0, <32 x i8> undef, i32 -1)
+ ; add forces execution domain
+ %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) nounwind readonly