[llvm] f42de69 - [X86] vector-shuffle-512-v16.ll - add fast shuffle test coverage
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed May 29 02:38:25 PDT 2024
Author: Simon Pilgrim
Date: 2024-05-29T10:38:03+01:00
New Revision: f42de69213890f1203c1c3418a962e50de4ed73c
URL: https://github.com/llvm/llvm-project/commit/f42de69213890f1203c1c3418a962e50de4ed73c
DIFF: https://github.com/llvm/llvm-project/commit/f42de69213890f1203c1c3418a962e50de4ed73c.diff
LOG: [X86] vector-shuffle-512-v16.ll - add fast shuffle test coverage
Added:
Modified:
llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index c981d973fef3e..bad0b411f68a9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefixes=ALL,SLOW,AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,FAST,AVX512BW
target triple = "x86_64-unknown-unknown"
@@ -14,21 +16,33 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
}
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x float> %shuffle
}
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%tmp0 = bitcast <16 x i32> %a to <16 x float>
%tmp1 = bitcast <16 x i32> %b to <16 x float>
%shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -196,11 +210,20 @@ define <16 x float> @shuffle_f32_v16f32_00_08_01_09_02_10_03_11_04_12_05_13_06_1
; PR86076
define <16 x float> @shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08(float %a0, float %a1) {
-; ALL-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
-; ALL: # %bb.0:
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SLOW-NEXT: vbroadcastsd %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_f32_v16f32_00_08_00_08_00_08_00_08_00_08_00_08_00_08_00_08:
+; FAST: # %bb.0:
+; FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16]
+; FAST-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
+; FAST-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; FAST-NEXT: retq
%v0 = insertelement <8 x float> poison, float %a0, i64 0
%v1 = insertelement <8 x float> poison, float %a1, i64 0
%sv = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
@@ -217,11 +240,17 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
}
define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
-; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <16 x i32> %shuffle
}
@@ -302,21 +331,33 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
; PR46249
define <16 x i32> @shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x i32> %a) {
-; ALL-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16i32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%1 = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <16 x i32> %1
}
define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) {
-; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SLOW-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <16 x float> %1
}
@@ -333,11 +374,17 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
}
define <16 x float> @shuffle_v16f32_load_08_11_10_00_12_15_14_04(<16 x float> %a0, ptr %a1) {
-; ALL-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
-; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vshufps {{.*#+}} zmm1 = zmm0[2,0],mem[0,0],zmm0[6,4],mem[4,4],zmm0[10,8],mem[8,8],zmm0[14,12],mem[12,12]
+; SLOW-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,2],zmm0[4,7],zmm1[4,6],zmm0[8,11],zmm1[8,10],zmm0[12,15],zmm1[12,14]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v16f32_load_08_11_10_00_12_15_14_04:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,2,16,4,7,6,20,8,11,10,24,12,15,14,28]
+; FAST-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
+; FAST-NEXT: retq
%1 = load <16 x float>, ptr %a1
%2 = shufflevector <16 x float> %1, <16 x float> %a0, <16 x i32> <i32 16, i32 19, i32 18, i32 0, i32 20, i32 23, i32 22, i32 4, i32 24, i32 27, i32 26, i32 8, i32 28, i32 31, i32 30, i32 12>
ret <16 x float> %2
@@ -365,26 +412,41 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
;FIXME: can do better with vpcompress
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ALL-NEXT: retq
+; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; FAST-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x i32> %res
}
;FIXME: can do better with vpcompress
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vbroadcastss %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; SLOW-LABEL: test_v16i32_0_1_2_12:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
+; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; SLOW-NEXT: vzeroupper
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: test_v16i32_0_1_2_12:
+; FAST: # %bb.0:
+; FAST-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,2,12]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FAST-NEXT: vzeroupper
+; FAST-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
ret <4 x i32> %res
}
@@ -568,11 +630,18 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
}
define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
-; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vbroadcastss %xmm0, %zmm0
-; ALL-NEXT: retq
+; SLOW-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; SLOW-NEXT: vbroadcastss %xmm0, %zmm0
+; SLOW-NEXT: retq
+;
+; FAST-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
+; FAST: # %bb.0:
+; FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; FAST-NEXT: vbroadcastss {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <16 x float> %shuffle
}
More information about the llvm-commits mailing list