[llvm] e652c0f - [X86] Teach lowerShuffleAsBlend to use bit blend for v16i8/v32i8/v16i16 when avx512vl is enabled but not avx512bw.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 4 10:27:59 PDT 2020
Author: Craig Topper
Date: 2020-07-04T10:26:56-07:00
New Revision: e652c0f8f3e7c7a1b42edf22cfc5bbfd597fd164
URL: https://github.com/llvm/llvm-project/commit/e652c0f8f3e7c7a1b42edf22cfc5bbfd597fd164
DIFF: https://github.com/llvm/llvm-project/commit/e652c0f8f3e7c7a1b42edf22cfc5bbfd597fd164.diff
LOG: [X86] Teach lowerShuffleAsBlend to use bit blend for v16i8/v32i8/v16i16 when avx512vl is enabled but not avx512bw.
Probably not super important since there are no real CPUs with
avx512vl and not avx512bw. But vpternlog should be better than
vpblendvb.
I do wonder if we should use vpternlog even with BWI. We
currently use vpblendmb or vpblendmw by putting the mask into a GPR
and moving it to a k-register. But I don't think we hoist the
GPR to k-register copy in machine LICM. Using VPTERNLOG would use
a constant pool load, but has the advantage that we're pretty good
at hoisting and rematerializing those.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D83156
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-shuffle-v48.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 83ad55a94fa9..88a563720c2a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11731,6 +11731,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
+ // If we have VPTERNLOG, we can use that as a bit blend.
+ if (Subtarget.hasVLX())
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 464723172dad..0706254f4e5c 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -148,18 +148,17 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX256VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
+; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13],zero,zero,ymm1[8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23],zero,zero,ymm1[30,31,16,17]
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX256VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1]
-; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
-; AVX256VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm2
-; AVX256VL-NEXT: vpslld $31, %ymm2, %ymm2
-; AVX256VL-NEXT: vptestmd %ymm2, %ymm2, %k1
-; AVX256VL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX256VL-NEXT: vpternlogq $220, {{.*}}(%rip), %ymm1, %ymm2
+; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1
+; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
+; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1
; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index d8f1e54b959c..6b49f22f21f1 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1345,10 +1345,11 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
;
; AVX512VL-LABEL: negative:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3]
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 7cd454f9dc96..b2ad1b33384e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -2905,8 +2905,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm2, %zmm2
; AVX512VL-NEXT: vpord %zmm1, %zmm2, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $216, %xmm2, %xmm1, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 678ae18b5f33..674b064100c4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -2376,8 +2376,8 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 35ba0e812585..23fbc5e70707 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2651,9 +2651,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $202, %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index ad29afec0958..bd5698bc63be 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -2083,9 +2083,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
index 5cd35af8cc30..e85492325825 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -52,17 +52,16 @@ define <32 x i8> @foo(<48 x i8>* %x0) {
;
; AVX512F-LABEL: foo:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: foo:
More information about the llvm-commits
mailing list