[llvm] ac534d2 - [X86] combineArithReduction - use PACKUSWB directly for PSADBW(TRUNCATE(v8i16 X)) reduction patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 25 06:57:14 PDT 2023
Author: Simon Pilgrim
Date: 2023-10-25T14:56:58+01:00
New Revision: ac534d2a16bb3c77eed050f1d43d4f87349ca097
URL: https://github.com/llvm/llvm-project/commit/ac534d2a16bb3c77eed050f1d43d4f87349ca097
DIFF: https://github.com/llvm/llvm-project/commit/ac534d2a16bb3c77eed050f1d43d4f87349ca097.diff
LOG: [X86] combineArithReduction - use PACKUSWB directly for PSADBW(TRUNCATE(v8i16 X)) reduction patterns
Avoids a crash in the D152928 patch caused by a reduction pattern appearing after legalization.
We can probably extend this further to avoid truncating to sub-128-bit vXi8 (and then calling WidenToV16I8) entirely, but we can't currently hit other cases.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20e0210bcec5b6a..93db31e03e116e7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43935,10 +43935,15 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
(EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
Subtarget.hasAVX512())) {
- EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
- Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
- if (ByteVT.getSizeInBits() < 128)
- Rdx = WidenToV16I8(Rdx, true);
+ if (Rdx.getValueType() == MVT::v8i16) {
+ Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
+ DAG.getUNDEF(MVT::v8i16));
+ } else {
+ EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
+ Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
+ if (ByteVT.getSizeInBits() < 128)
+ Rdx = WidenToV16I8(Rdx, true);
+ }
// Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index a1efa9d150346b6..f8e3a7a23056fb4 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1949,79 +1949,31 @@ define void @PR34773(ptr %a0, ptr %a1) {
}
define i16 @PR66194(i8 %q) {
-; SSE2-LABEL: PR66194:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: xorl %ecx, %ecx
-; SSE2-NEXT: testb %dil, %dil
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: sete %cl
-; SSE2-NEXT: movl %ecx, %edx
-; SSE2-NEXT: shll $16, %edx
-; SSE2-NEXT: orl %eax, %edx
-; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: pinsrw $2, %eax, %xmm0
-; SSE2-NEXT: pinsrw $3, %eax, %xmm0
-; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
-; SSE2-NEXT: pinsrw $5, %eax, %xmm0
-; SSE2-NEXT: pinsrw $6, %eax, %xmm0
-; SSE2-NEXT: pinsrw $7, %ecx, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: psubw %xmm1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR66194:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: xorl %ecx, %ecx
-; SSSE3-NEXT: testb %dil, %dil
-; SSSE3-NEXT: setne %al
-; SSSE3-NEXT: sete %cl
-; SSSE3-NEXT: movl %ecx, %edx
-; SSSE3-NEXT: shll $16, %edx
-; SSSE3-NEXT: orl %eax, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: pinsrw $2, %eax, %xmm0
-; SSSE3-NEXT: pinsrw $3, %eax, %xmm0
-; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
-; SSSE3-NEXT: pinsrw $5, %eax, %xmm0
-; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
-; SSSE3-NEXT: pinsrw $7, %ecx, %xmm0
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: psubw %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: shll $8, %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: orl %eax, %ecx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: shll $8, %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: orl %eax, %edx
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: shll $16, %eax
-; SSSE3-NEXT: orl %edx, %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT: shll $24, %edx
-; SSSE3-NEXT: orl %eax, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT: shll $8, %eax
-; SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT: orl %eax, %ecx
-; SSSE3-NEXT: pinsrw $3, %ecx, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: psadbw %xmm0, %xmm1
-; SSSE3-NEXT: movd %xmm1, %eax
-; SSSE3-NEXT: # kill: def $ax killed $ax killed $eax
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: PR66194:
+; SSE2-SSSE3: # %bb.0: # %entry
+; SSE2-SSSE3-NEXT: xorl %eax, %eax
+; SSE2-SSSE3-NEXT: xorl %ecx, %ecx
+; SSE2-SSSE3-NEXT: testb %dil, %dil
+; SSE2-SSSE3-NEXT: setne %al
+; SSE2-SSSE3-NEXT: sete %cl
+; SSE2-SSSE3-NEXT: movl %ecx, %edx
+; SSE2-SSSE3-NEXT: shll $16, %edx
+; SSE2-SSSE3-NEXT: orl %eax, %edx
+; SSE2-SSSE3-NEXT: movd %edx, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $2, %eax, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $3, %eax, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $4, %ecx, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $5, %eax, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-SSSE3-NEXT: pinsrw $7, %ecx, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: psubw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: psadbw %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movd %xmm1, %eax
+; SSE2-SSSE3-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: PR66194:
; SSE41: # %bb.0: # %entry
@@ -2040,7 +1992,7 @@ define i16 @PR66194(i8 %q) {
; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psadbw %xmm0, %xmm1
; SSE41-NEXT: movd %xmm1, %eax
More information about the llvm-commits mailing list