[llvm] 5e42636 - [X86][AVX] Add tests showing failure to use chained PACKSS/PACKUS for multi-stage compaction shuffles
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 3 06:16:26 PDT 2020
Author: Simon Pilgrim
Date: 2020-04-03T14:16:16+01:00
New Revision: 5e426363ba4bd7b5e298be2b2ee4634e7abe7083
URL: https://github.com/llvm/llvm-project/commit/5e426363ba4bd7b5e298be2b2ee4634e7abe7083
DIFF: https://github.com/llvm/llvm-project/commit/5e426363ba4bd7b5e298be2b2ee4634e7abe7083.diff
LOG: [X86][AVX] Add tests showing failure to use chained PACKSS/PACKUS for multi-stage compaction shuffles
The sign/zero-extended top bits mean that we could use chained PACKSS/PACKUS ops here.
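
For context, a rough sketch of the kind of AVX512BW codegen a chained-pack lowering could enable for the new ashr test (hypothetical output, not produced by this commit): after the arithmetic shift by 25 every i32 element already fits in a sign-extended i8, so the per-128-bit-lane behaviour of the pack instructions lines up with the shuffle mask and the saturation is a no-op.

    vpsrad    $25, %zmm0, %zmm0
    vpsrad    $25, %zmm1, %zmm1
    vpackssdw %zmm1, %zmm0, %zmm0   # per 128-bit lane: [sat16(a0..a3), sat16(b0..b3)]
    vpacksswb %zmm0, %zmm0, %zmm0   # per 128-bit lane: [a0..a3, b0..b3, a0..a3, b0..b3] as i8
    retq

That would replace the extract/pshufb/blend/insert sequences in the checks below with two pack instructions; the lshr variant could use PACKUSDW/PACKUSWB in the same way.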
Added:
Modified:
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 3c95f4ce400e..30574e4d4cdf 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -590,6 +590,73 @@ define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_7
ret <64 x i8> %5
}
+define <64 x i8> @shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_92_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm2
+; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,64,68,72,76,0,4,8,12,64,68,72,76,16,20,24,28,80,84,88,92,16,20,24,28,80,84,88,92,32,36,40,44,96,100,104,108,32,36,40,44,96,100,104,108,48,52,56,60,112,116,120,124,48,52,56,60,112,116,120,124]
+; AVX512VBMI-NEXT: vpermi2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: retq
+ %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+ %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+ %3 = bitcast <16 x i32> %1 to <64 x i8>
+ %4 = bitcast <16 x i32> %2 to <64 x i8>
+ %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
+ ret <64 x i8> %5
+}
+
define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125:
; AVX512F: # %bb.0:
@@ -634,6 +701,73 @@ define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_7
ret <64 x i8> %5
}
+define <64 x i8> @shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512F-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512F-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrld $25, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12>
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u,0,4,8,12,u,u,u,u>
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_04_08_12_64_68_72_76_00_04_08_12_64_68_72_76_16_20_24_28_80_84_88_092_16_20_24_28_80_84_88_92_32_36_40_44_96_100_104_108_32_36_40_44_96_100_104_108_48_52_56_60_112_116_120_124_48_52_56_60_112_116_120_124:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm2
+; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4,8,12,64,68,72,76,0,4,8,12,64,68,72,76,16,20,24,28,80,84,88,92,16,20,24,28,80,84,88,92,32,36,40,44,96,100,104,108,32,36,40,44,96,100,104,108,48,52,56,60,112,116,120,124,48,52,56,60,112,116,120,124]
+; AVX512VBMI-NEXT: vpermi2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: retq
+ %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+ %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+ %3 = bitcast <16 x i32> %1 to <64 x i8>
+ %4 = bitcast <16 x i32> %2 to <64 x i8>
+ %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 0, i32 4, i32 8, i32 12, i32 64, i32 68, i32 72, i32 76, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 16, i32 20, i32 24, i32 28, i32 80, i32 84, i32 88, i32 92, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 32, i32 36, i32 40, i32 44, i32 96, i32 100, i32 104, i32 108, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124, i32 48, i32 52, i32 56, i32 60, i32 112, i32 116, i32 120, i32 124>
+ ret <64 x i8> %5
+}
+
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512F: # %bb.0: