[llvm] r287974 - [AVX-512] Add masked 128/256-bit integer add/sub instructions to load folding tables.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Nov 26 00:21:49 PST 2016
Author: ctopper
Date: Sat Nov 26 02:21:48 2016
New Revision: 287974
URL: http://llvm.org/viewvc/llvm-project?rev=287974&view=rev
Log:
[AVX-512] Add masked 128/256-bit integer add/sub instructions to load folding tables.
Modified:
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=287974&r1=287973&r2=287974&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Sat Nov 26 02:21:48 2016
@@ -2191,12 +2191,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
{ X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+ { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+ { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+ { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+ { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+ { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+ { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+ { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
{ X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
{ X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+ { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+ { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+ { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+ { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+ { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+ { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+ { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+ { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
{ X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
{ X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
@@ -2225,12 +2241,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
{ X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+ { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+ { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+ { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+ { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+ { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+ { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+ { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
{ X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
{ X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+ { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+ { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+ { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+ { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+ { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+ { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+ { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+ { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
{ X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
{ X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
@@ -2328,12 +2360,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
{ X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+ { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+ { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+ { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+ { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+ { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+ { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+ { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
{ X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
{ X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+ { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+ { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+ { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+ { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+ { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+ { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+ { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+ { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
{ X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
{ X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
@@ -2366,12 +2414,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
{ X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+ { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+ { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+ { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+ { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+ { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+ { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+ { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
{ X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
{ X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+ { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+ { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+ { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+ { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+ { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+ { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+ { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+ { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
{ X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
{ X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll?rev=287974&r1=287973&r2=287974&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll Sat Nov 26 02:21:48 2016
@@ -16,6 +16,28 @@ define <16 x i8> @stack_fold_paddb(<16 x
ret <16 x i8> %2
}
+define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_paddb_mask
+ ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = add <16 x i8> %a0, %a1
+ %3 = bitcast i16 %mask to <16 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <16 x i8>, <16 x i8>* %a2
+ %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
+ ret <16 x i8> %5
+}
+
+define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+ ;CHECK-LABEL: stack_fold_paddb_maskz
+ ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = add <16 x i8> %a0, %a1
+ %3 = bitcast i16 %mask to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
+ ret <16 x i8> %4
+}
+
define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_paddb_ymm
;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
@@ -24,6 +46,28 @@ define <32 x i8> @stack_fold_paddb_ymm(<
ret <32 x i8> %2
}
+define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_paddb_mask_ymm
+ ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = add <32 x i8> %a0, %a1
+ %3 = bitcast i32 %mask to <32 x i1>
+ ; load needed to keep the operation from being scheduled above the asm block
+ %4 = load <32 x i8>, <32 x i8>* %a2
+ %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
+ ret <32 x i8> %5
+}
+
+define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+ ;CHECK-LABEL: stack_fold_paddb_maskz_ymm
+ ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = add <32 x i8> %a0, %a1
+ %3 = bitcast i32 %mask to <32 x i1>
+ %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
+ ret <32 x i8> %4
+}
+
define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_paddd
;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
More information about the llvm-commits
mailing list