[llvm] r295154 - [AVX-512] Add PACKSS/PACKUS instructions to load folding tables.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 14 22:51:40 PST 2017


Author: ctopper
Date: Wed Feb 15 00:51:39 2017
New Revision: 295154

URL: http://llvm.org/viewvc/llvm-project?rev=295154&view=rev
Log:
[AVX-512] Add PACKSS/PACKUS instructions to load folding tables.

Modified:
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
    llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
    llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=295154&r1=295153&r2=295154&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Wed Feb 15 00:51:39 2017
@@ -1894,6 +1894,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULSSZrr_Int,     X86::VMULSSZrm_Int,       TB_NO_REVERSE },
     { X86::VORPDZrr,          X86::VORPDZrm,            0 },
     { X86::VORPSZrr,          X86::VORPSZrm,            0 },
+    { X86::VPACKSSDWZrr,      X86::VPACKSSDWZrm,        0 },
+    { X86::VPACKSSWBZrr,      X86::VPACKSSWBZrm,        0 },
+    { X86::VPACKUSDWZrr,      X86::VPACKUSDWZrm,        0 },
+    { X86::VPACKUSWBZrr,      X86::VPACKUSWBZrm,        0 },
     { X86::VPADDBZrr,         X86::VPADDBZrm,           0 },
     { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
     { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
@@ -2068,6 +2072,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VORPDZ256rr,       X86::VORPDZ256rm,         0 },
     { X86::VORPSZ128rr,       X86::VORPSZ128rm,         0 },
     { X86::VORPSZ256rr,       X86::VORPSZ256rm,         0 },
+    { X86::VPACKSSDWZ256rr,   X86::VPACKSSDWZ256rm,     0 },
+    { X86::VPACKSSDWZ128rr,   X86::VPACKSSDWZ128rm,     0 },
+    { X86::VPACKSSWBZ256rr,   X86::VPACKSSWBZ256rm,     0 },
+    { X86::VPACKSSWBZ128rr,   X86::VPACKSSWBZ128rm,     0 },
+    { X86::VPACKUSDWZ256rr,   X86::VPACKUSDWZ256rm,     0 },
+    { X86::VPACKUSDWZ128rr,   X86::VPACKUSDWZ128rm,     0 },
+    { X86::VPACKUSWBZ256rr,   X86::VPACKUSWBZ256rm,     0 },
+    { X86::VPACKUSWBZ128rr,   X86::VPACKUSWBZ128rm,     0 },
     { X86::VPADDBZ128rr,      X86::VPADDBZ128rm,        0 },
     { X86::VPADDBZ256rr,      X86::VPADDBZ256rm,        0 },
     { X86::VPADDDZ128rr,      X86::VPADDDZ128rm,        0 },
@@ -2562,6 +2574,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULSSZrr_Intkz,       X86::VMULSSZrm_Intkz,       TB_NO_REVERSE },
     { X86::VORPDZrrkz,            X86::VORPDZrmkz,            0 },
     { X86::VORPSZrrkz,            X86::VORPSZrmkz,            0 },
+    { X86::VPACKSSDWZrrkz,        X86::VPACKSSDWZrmkz,        0 },
+    { X86::VPACKSSWBZrrkz,        X86::VPACKSSWBZrmkz,        0 },
+    { X86::VPACKUSDWZrrkz,        X86::VPACKUSDWZrmkz,        0 },
+    { X86::VPACKUSWBZrrkz,        X86::VPACKUSWBZrmkz,        0 },
     { X86::VPADDBZrrkz,           X86::VPADDBZrmkz,           0 },
     { X86::VPADDDZrrkz,           X86::VPADDDZrmkz,           0 },
     { X86::VPADDQZrrkz,           X86::VPADDQZrmkz,           0 },
@@ -2687,6 +2703,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULPSZ256rrkz,        X86::VMULPSZ256rmkz,        0 },
     { X86::VORPDZ256rrkz,         X86::VORPDZ256rmkz,         0 },
     { X86::VORPSZ256rrkz,         X86::VORPSZ256rmkz,         0 },
+    { X86::VPACKSSDWZ256rrkz,     X86::VPACKSSDWZ256rmkz,     0 },
+    { X86::VPACKSSWBZ256rrkz,     X86::VPACKSSWBZ256rmkz,     0 },
+    { X86::VPACKUSDWZ256rrkz,     X86::VPACKUSDWZ256rmkz,     0 },
+    { X86::VPACKUSWBZ256rrkz,     X86::VPACKUSWBZ256rmkz,     0 },
     { X86::VPADDBZ256rrkz,        X86::VPADDBZ256rmkz,        0 },
     { X86::VPADDDZ256rrkz,        X86::VPADDDZ256rmkz,        0 },
     { X86::VPADDQZ256rrkz,        X86::VPADDQZ256rmkz,        0 },
@@ -2806,6 +2826,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULPSZ128rrkz,        X86::VMULPSZ128rmkz,        0 },
     { X86::VORPDZ128rrkz,         X86::VORPDZ128rmkz,         0 },
     { X86::VORPSZ128rrkz,         X86::VORPSZ128rmkz,         0 },
+    { X86::VPACKSSDWZ128rrkz,     X86::VPACKSSDWZ128rmkz,     0 },
+    { X86::VPACKSSWBZ128rrkz,     X86::VPACKSSWBZ128rmkz,     0 },
+    { X86::VPACKUSDWZ128rrkz,     X86::VPACKUSDWZ128rmkz,     0 },
+    { X86::VPACKUSWBZ128rrkz,     X86::VPACKUSWBZ128rmkz,     0 },
     { X86::VPADDBZ128rrkz,        X86::VPADDBZ128rmkz,        0 },
     { X86::VPADDDZ128rrkz,        X86::VPADDDZ128rmkz,        0 },
     { X86::VPADDQZ128rrkz,        X86::VPADDQZ128rmkz,        0 },
@@ -3069,6 +3093,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULSSZrr_Intk,     X86::VMULSSZrm_Intk,       TB_NO_REVERSE },
     { X86::VORPDZrrk,          X86::VORPDZrmk,            0 },
     { X86::VORPSZrrk,          X86::VORPSZrmk,            0 },
+    { X86::VPACKSSDWZrrk,      X86::VPACKSSDWZrmk,        0 },
+    { X86::VPACKSSWBZrrk,      X86::VPACKSSWBZrmk,        0 },
+    { X86::VPACKUSDWZrrk,      X86::VPACKUSDWZrmk,        0 },
+    { X86::VPACKUSWBZrrk,      X86::VPACKUSWBZrmk,        0 },
     { X86::VPADDBZrrk,         X86::VPADDBZrmk,           0 },
     { X86::VPADDDZrrk,         X86::VPADDDZrmk,           0 },
     { X86::VPADDQZrrk,         X86::VPADDQZrmk,           0 },
@@ -3207,6 +3235,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULPSZ256rrk,      X86::VMULPSZ256rmk,        0 },
     { X86::VORPDZ256rrk,       X86::VORPDZ256rmk,         0 },
     { X86::VORPSZ256rrk,       X86::VORPSZ256rmk,         0 },
+    { X86::VPACKSSDWZ256rrk,   X86::VPACKSSDWZ256rmk,     0 },
+    { X86::VPACKSSWBZ256rrk,   X86::VPACKSSWBZ256rmk,     0 },
+    { X86::VPACKUSDWZ256rrk,   X86::VPACKUSDWZ256rmk,     0 },
+    { X86::VPACKUSWBZ256rrk,   X86::VPACKUSWBZ256rmk,     0 },
     { X86::VPADDBZ256rrk,      X86::VPADDBZ256rmk,        0 },
     { X86::VPADDDZ256rrk,      X86::VPADDDZ256rmk,        0 },
     { X86::VPADDQZ256rrk,      X86::VPADDQZ256rmk,        0 },
@@ -3340,6 +3372,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget
     { X86::VMULPSZ128rrk,      X86::VMULPSZ128rmk,        0 },
     { X86::VORPDZ128rrk,       X86::VORPDZ128rmk,         0 },
     { X86::VORPSZ128rrk,       X86::VORPSZ128rmk,         0 },
+    { X86::VPACKSSDWZ128rrk,   X86::VPACKSSDWZ128rmk,     0 },
+    { X86::VPACKSSWBZ128rrk,   X86::VPACKSSWBZ128rmk,     0 },
+    { X86::VPACKUSDWZ128rrk,   X86::VPACKUSDWZ128rmk,     0 },
+    { X86::VPACKUSWBZ128rrk,   X86::VPACKUSWBZ128rmk,     0 },
     { X86::VPADDBZ128rrk,      X86::VPADDBZ128rmk,        0 },
     { X86::VPADDDZ128rrk,      X86::VPADDDZ128rmk,        0 },
     { X86::VPADDQZ128rrk,      X86::VPADDQZ128rmk,        0 },

Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll?rev=295154&r1=295153&r2=295154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll Wed Feb 15 00:51:39 2017
@@ -278,6 +278,59 @@ define <32 x i16> @stack_fold_pabsw_mask
   ret <32 x i16> %2
 }
 
+define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) nounwind readnone
+
+define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a0, <32 x i16> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64) nounwind readnone
+
+define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) nounwind readnone
+
+define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_packusdw_mask
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <32 x i16>, <32 x i16>* %passthru
+  %3 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> %2, i32 %mask)
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_packusdw_maskz
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %2
+}
+
+define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a0, <32 x i16> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64) nounwind readnone
+
 define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_paddb
   ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload

Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll?rev=295154&r1=295153&r2=295154&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512vl.ll Wed Feb 15 00:51:39 2017
@@ -193,6 +193,78 @@ define <16 x i16> @stack_fold_pabsw_ymm(
 }
 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
 
+define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw_ymm
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb_ymm
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw_ymm
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb_ymm
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
 define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_paddb
   ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload




More information about the llvm-commits mailing list