[llvm] 6ba5fc2 - [X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 11 09:40:20 PDT 2022
Author: Simon Pilgrim
Date: 2022-08-11T17:40:07+01:00
New Revision: 6ba5fc2deedd1a29126ad784bd000974ef139438
URL: https://github.com/llvm/llvm-project/commit/6ba5fc2deedd1a29126ad784bd000974ef139438
DIFF: https://github.com/llvm/llvm-project/commit/6ba5fc2deedd1a29126ad784bd000974ef139438.diff
LOG: [X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets
lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate usually isn't necessary to make the VPMOV node worthwhile (as we're only targeting v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count.
Fixes the remaining regression left by the fixes in rG293899c64b75
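As a concrete example, the trunc_qw_128 case from the avx512-trunc.ll test updated below hits this path: on a VLX target (the SKX check prefix) the <2 x i64> -> <2 x i16> truncation now lowers to a single vpmovqw instead of the previous vpshufb.

    define <2 x i16> @trunc_qw_128(<2 x i64> %i) {
      %x = trunc <2 x i64> %i to <2 x i16>
      ret <2 x i16> %x
    }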
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-trunc.ll
llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
llvm/test/CodeGen/X86/vector-rotate-128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bea4d58e758ec..d5d41ca50553d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
unsigned EltSizeInBits = VT.getScalarSizeInBits();
unsigned MaxScale = 64 / EltSizeInBits;
for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
unsigned NumSrcElts = NumElts / Scale;
unsigned UpperElts = NumElts - NumSrcElts;
if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
continue;
+ // Attempt to find a matching source truncation, but as a fall back VLX
+ // cases can use the VPMOV directly.
SDValue Src = peekThroughBitcasts(V1);
- if (Src.getOpcode() != ISD::TRUNCATE ||
- Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getScalarValueSizeInBits() == SrcEltBits) {
+ Src = Src.getOperand(0);
+ } else if (Subtarget.hasVLX()) {
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ Src = DAG.getBitcast(SrcVT, Src);
+ // Don't do this if PACKSS/PACKUS could perform it cheaper.
+ if (Scale == 2 &&
+ ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+ (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+ return SDValue();
+ } else
return SDValue();
- Src = Src.getOperand(0);
// VPMOVWB is only available with avx512bw.
- MVT SrcVT = Src.getSimpleValueType();
- if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
- !Subtarget.hasBWI())
+ if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
return SDValue();
bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
index 096ce90742a67..c41c8ca835cf7 100644
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
;
; SKX-LABEL: trunc_qw_128:
; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i16>
ret <2 x i16> %x
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
index 60364e638d8a8..54e62066d2eea 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
; CHECK-LABEL: test_u1tofp2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 3ac5ce820053b..18dbb3ffec159 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; AVX512-LABEL: vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm1, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 32
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index e16668a96bd17..d06ab7caec3b6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
; AVX512-LABEL: vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovqw %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovd %xmm1, (%rsi)
-; AVX512-NEXT: vmovd %xmm2, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT: vmovd %xmm1, (%rdx)
; AVX512-NEXT: vmovd %xmm3, (%rcx)
-; AVX512-NEXT: vmovd %xmm0, (%r8)
+; AVX512-NEXT: vmovd %xmm2, (%r8)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 32
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 8ed4d83fcf8d8..0cd027aa05154 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: retq
;
-; AVX512-LABEL: rot16_trunc:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX: # %bb.0:
+; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT: retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX: # %bb.0:
+; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT: retq
;
; XOP-LABEL: rot16_trunc:
; XOP: # %bb.0: