[llvm] 5e42636 - [X86][AVX] Add tests showing failure to use chained PACKSS/PACKUS for multi-stage compaction shuffles

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 3 06:16:26 PDT 2020


Author: Simon Pilgrim
Date: 2020-04-03T14:16:16+01:00
New Revision: 5e426363ba4bd7b5e298be2b2ee4634e7abe7083

URL: https://github.com/llvm/llvm-project/commit/5e426363ba4bd7b5e298be2b2ee4634e7abe7083
DIFF: https://github.com/llvm/llvm-project/commit/5e426363ba4bd7b5e298be2b2ee4634e7abe7083.diff

LOG: [X86][AVX] Add tests showing failure to use chained PACKSS/PACKUS for multi-stage compaction shuffles

The sign/zero-extended top bits mean that we could use chained PACKSS/PACKUS ops here.
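
As a rough sketch of the idea (using 128-bit SSE2 intrinsics rather than the 512-bit codegen these tests exercise, with illustrative values and a main() harness that are not part of this commit): once the i32 elements are known to be sign-extended from 8 bits, PACKSSDW followed by PACKSSWB compacts them to i8 without the saturation changing any value.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
        /* Eight i32 values already sign-extended from 8 bits, as an
           ashr-by-25 of an i32 always produces (range [-64, 63]). */
        __m128i a = _mm_setr_epi32(-64, 1, 2, 63);
        __m128i b = _mm_setr_epi32(-5, 42, -6, 7);

        /* Chained PACKSS: i32 -> i16 -> i8. Saturation never triggers
           because every element already fits in a signed byte. */
        __m128i w = _mm_packs_epi32(a, b);  /* PACKSSDW */
        __m128i c = _mm_packs_epi16(w, w);  /* PACKSSWB */

        signed char out[16];
        _mm_storeu_si128((__m128i *)out, c);
        for (int i = 0; i < 8; ++i)
            printf("%d ", out[i]);          /* prints: -64 1 2 63 -5 42 -6 7 */
        printf("\n");
        return 0;
    }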

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 3c95f4ce400e..30574e4d4cdf 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -590,6 +590,73 @@ define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_7
   ret <64 x i8> %5
 }
 
+define <64 x i8> @shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrad $25, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsrad $25, %zmm1, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrad $25, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrad $25, %zmm1, %zmm1
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512BW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX512BW-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsrad $25, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsrad $25, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vpsrad $25, %zmm0, %zmm2
+; AVX512VBMI-NEXT:    vpsrad $25, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,64,68,72,76,0,4,8,12,64,68,72,76,16,20,24,28,80,84,88,92,16,20,24,28,80,84,88,92,32,36,40,44,96,100,104,108,32,36,40,44,96,100,104,108,48,52,56,60,112,116,120,124,48,52,56,60,112,116,120,124]
+; AVX512VBMI-NEXT:    vpermi2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    retq
+  %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %3 = bitcast <16 x i32> %1 to <64 x i8>
+  %4 = bitcast <16 x i32> %2 to <64 x i8>
+  %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32  0, i32  4, i32  8, i32 12, i32  64, i32  68, i32  72, i32  76, i32  0, i32  4, i32  8, i32 12, i32  64, i32  68, i32  72, i32  76, i32 16, i32 20, i32 24, i32 28, i32  80, i32  84, i32  88, i32  92, i32 16, i32 20, i32 24, i32 28, i32  80, i32  84, i32  88, i32  92, i32 32, i32 36, i32 40, i32 44, i32  96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32  96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
+  ret <64 x i8> %5
+}
+
 define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
 ; AVX512F:       # %bb.0:
@@ -634,6 +701,73 @@ define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_7
   ret <64 x i8> %5
 }
 
+define <64 x i8> @shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpsrld $25, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsrld $25, %zmm1, %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512F-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpsrld $25, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrld $25, %zmm1, %zmm1
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512BW-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX512BW-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpsrld $25, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsrld $25, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vpsrld $25, %zmm0, %zmm2
+; AVX512VBMI-NEXT:    vpsrld $25, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,64,68,72,76,0,4,8,12,64,68,72,76,16,20,24,28,80,84,88,92,16,20,24,28,80,84,88,92,32,36,40,44,96,100,104,108,32,36,40,44,96,100,104,108,48,52,56,60,112,116,120,124,48,52,56,60,112,116,120,124]
+; AVX512VBMI-NEXT:    vpermi2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    retq
+  %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %3 = bitcast <16 x i32> %1 to <64 x i8>
+  %4 = bitcast <16 x i32> %2 to <64 x i8>
+  %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32  0, i32  4, i32  8, i32 12, i32  64, i32  68, i32  72, i32  76, i32  0, i32  4, i32  8, i32 12, i32  64, i32  68, i32  72, i32  76, i32 16, i32 20, i32 24, i32 28, i32  80, i32  84, i32  88, i32  92, i32 16, i32 20, i32 24, i32 28, i32  80, i32  84, i32  88, i32  92, i32 32, i32 36, i32 40, i32 44, i32  96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32  96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
+  ret <64 x i8> %5
+}
+
 define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
 ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
 ; AVX512F:       # %bb.0:

