[PATCH] D68763: [X86] Use packusdw+vpmovuswb to implement v16i32->V16i8 that clamps signed inputs to be between 0 and 255 when zmm registers are disabled on SKX.
Craig Topper via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 12:47:56 PDT 2019
This revision was automatically updated to reflect the committed changes.
Closed by commit rG0e561437c587: [X86] Use packusdw+vpmovuswb to implement v16i32->V16i8 that clamps signed… (authored by craig.topper).
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D68763/new/
https://reviews.llvm.org/D68763
Files:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/min-legal-vector-width.ll
Index: llvm/test/CodeGen/X86/min-legal-vector-width.ll
===================================================================
--- llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1083,12 +1083,10 @@
define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_packus_v16i32_v16i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm1
-; CHECK-NEXT: vpmovusdb %ymm1, %xmm1
-; CHECK-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vpmovusdb %ymm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT: vpmovuswb %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %p
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39841,6 +39841,21 @@
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
}
+
+ // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
+ // split across two registers, we can use a packusdw+perm to clamp to 0-65535
+ // and concatenate at the same time. Then we can use a final vpmovuswb to
+ // clip to 0-255.
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (auto USatVal = detectSSatPattern(In, VT, true)) {
+ // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
+ SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
+ DL, DAG, Subtarget);
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
+ }
+ }
+
if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
!(Subtarget.hasAVX512() && InSVT == MVT::i32) &&
!(Subtarget.hasBWI() && InSVT == MVT::i16) &&
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D68763.224452.patch
Type: text/x-patch
Size: 2231 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20191010/08393118/attachment.bin>
More information about the llvm-commits mailing list