[llvm] 9cdba33 - [X86] combineX86ShufflesRecursively - determine demanded elts to pass to getTargetShuffleInputs
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 31 03:33:04 PDT 2022
Author: Simon Pilgrim
Date: 2022-07-31T11:30:40+01:00
New Revision: 9cdba33337423f0083680c8f00251cc484b3898d
URL: https://github.com/llvm/llvm-project/commit/9cdba33337423f0083680c8f00251cc484b3898d
DIFF: https://github.com/llvm/llvm-project/commit/9cdba33337423f0083680c8f00251cc484b3898d.diff
LOG: [X86] combineX86ShufflesRecursively - determine demanded elts to pass to getTargetShuffleInputs
Only PACKSS/PACKUS faux shuffles make use of the demanded elts at the moment, but this at least improves the handling of a couple of truncation patterns.
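For context, a minimal self-contained C++ sketch of the demanded-elts computation this patch adds (this is not LLVM code: the helper name demandedOpElts, the uint64_t bitmask, and the example mask are illustrative assumptions, and it is capped at 64 elements where the real code uses APInt). The idea is: collect the root-mask indices that reference the operand, keep only the low-subvector bits when the operand is narrower than the root, then rescale to the operand's own element count, which is what APIntOps::ScaleBitMask does in the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical standalone helper mirroring the new OpDemandedElts logic.
uint64_t demandedOpElts(const std::vector<int> &RootMask, unsigned SrcOpIndex,
                        unsigned RootSizeInBits, unsigned OpSizeInBits,
                        unsigned OpNumElts) {
  // Collect the root-mask indices that refer to this source operand.
  uint64_t Demanded = 0;
  int BaseIdx = (int)(RootMask.size() * SrcOpIndex);
  for (int M : RootMask)
    if (M >= BaseIdx && M < BaseIdx + (int)RootMask.size())
      Demanded |= 1ULL << (M - BaseIdx);

  // If the op is narrower than the root, only the low-subvector bits apply.
  unsigned NumOpMaskElts = (unsigned)RootMask.size();
  if (OpSizeInBits != RootSizeInBits) {
    unsigned Scale = RootSizeInBits / OpSizeInBits;
    NumOpMaskElts = (unsigned)RootMask.size() / Scale;
    Demanded &= (NumOpMaskElts >= 64) ? ~0ULL : ((1ULL << NumOpMaskElts) - 1);
  }

  // Rescale per-mask-element bits to the op's element count (the role played
  // by APIntOps::ScaleBitMask in the patch).
  uint64_t Scaled = 0;
  if (OpNumElts >= NumOpMaskElts) {
    unsigned Scale = OpNumElts / NumOpMaskElts;
    for (unsigned I = 0; I != OpNumElts; ++I)
      if (Demanded & (1ULL << (I / Scale)))
        Scaled |= 1ULL << I;
  } else {
    unsigned Scale = NumOpMaskElts / OpNumElts;
    for (unsigned I = 0; I != NumOpMaskElts; ++I)
      if (Demanded & (1ULL << I))
        Scaled |= 1ULL << (I / Scale);
  }
  return Scaled;
}

int main() {
  // 16 x i8 root mask that only reads bytes 0..3 of operand 0 (the zext
  // pattern from fptoui-may-overflow.ll); -1 stands in for elements not taken
  // from operand 0 (undef/zero sentinels in the real code).
  std::vector<int> RootMask = {0, -1, -1, -1, 1, -1, -1, -1,
                               2, -1, -1, -1, 3, -1, -1, -1};
  uint64_t Demanded = demandedOpElts(RootMask, /*SrcOpIndex=*/0,
                                     /*RootSizeInBits=*/128,
                                     /*OpSizeInBits=*/128, /*OpNumElts=*/16);
  std::printf("demanded elts mask = 0x%llx\n",
              (unsigned long long)Demanded); // 0xf: only the low 4 bytes
  return 0;
}

In the fptoui_zext case below, only 4 of the 16 byte elements are demanded, which is what allows the VPACKUSDW/VPACKUSWB pair plus VPMOVZXBD to collapse into a single VANDPS.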
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fptoui-may-overflow.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 508658ded4a0e..f74d7e1896e08 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39002,12 +39002,31 @@ static SDValue combineX86ShufflesRecursively(
assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
"Can only combine shuffles upto size of the root op.");
+ // Create a demanded elts mask from the referenced elements of Op.
+ APInt OpDemandedElts = APInt::getZero(RootMask.size());
+ for (int M : RootMask) {
+ int BaseIdx = RootMask.size() * SrcOpIndex;
+ if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
+ OpDemandedElts.setBit(M - BaseIdx);
+ }
+ if (RootSizeInBits != VT.getSizeInBits()) {
+ // Op is smaller than Root - extract the demanded elts for the subvector.
+ unsigned Scale = RootSizeInBits / VT.getSizeInBits();
+ unsigned NumOpMaskElts = RootMask.size() / Scale;
+ assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
+ assert(OpDemandedElts
+ .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
+ .isZero() &&
+ "Out of range elements referenced in root mask");
+ OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
+ }
+ OpDemandedElts =
+ APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
+
// Extract target shuffle mask and resolve sentinels and inputs.
- // TODO - determine Op's demanded elts from RootMask.
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
APInt OpUndef, OpZero;
- APInt OpDemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
OpZero, DAG, Depth, false)) {
diff --git a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
index 37bdfaadaf57a..ec53704289d19 100644
--- a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
+++ b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
@@ -9,9 +9,7 @@ define <16 x i8> @fptoui_zext(<4 x float> %arg) {
; CHECK-LABEL: fptoui_zext:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%f = fptoui <4 x float> %arg to <4 x i8>
%z = zext <4 x i8> %f to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 2d48f30128e0e..7710186c8850b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -3715,14 +3715,22 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_usat_v4i32_v4i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_usat_v4i32_v4i8_store:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-SLOW-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_usat_v4i32_v4i8_store:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-FAST-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v4i32_v4i8_store:
; AVX512F: # %bb.0: