[llvm] ac534d2 - [X86] combineArithReduction - use PACKUSWB directly for PSADBW(TRUNCATE(v8i16 X)) reduction patterns

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 25 06:57:14 PDT 2023


Author: Simon Pilgrim
Date: 2023-10-25T14:56:58+01:00
New Revision: ac534d2a16bb3c77eed050f1d43d4f87349ca097

URL: https://github.com/llvm/llvm-project/commit/ac534d2a16bb3c77eed050f1d43d4f87349ca097
DIFF: https://github.com/llvm/llvm-project/commit/ac534d2a16bb3c77eed050f1d43d4f87349ca097.diff

LOG: [X86] combineArithReduction - use PACKUSWB directly for PSADBW(TRUNCATE(v8i16 X)) reduction patterns

Avoids a crash in the D152928 patch, which was caused by a reduction pattern appearing after legalization.

We can probably extend this further to avoid truncating to sub-128-bit vXi8 (and then calling WidenToV16I8) entirely, but no other cases can currently be triggered.

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-trunc.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 20e0210bcec5b6a..93db31e03e116e7 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43935,10 +43935,15 @@ static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
       DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
       (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
        Subtarget.hasAVX512())) {
-    EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
-    Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
-    if (ByteVT.getSizeInBits() < 128)
-      Rdx = WidenToV16I8(Rdx, true);
+    if (Rdx.getValueType() == MVT::v8i16) {
+      Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
+                        DAG.getUNDEF(MVT::v8i16));
+    } else {
+      EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
+      Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
+      if (ByteVT.getSizeInBits() < 128)
+        Rdx = WidenToV16I8(Rdx, true);
+    }
 
     // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
     auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,

diff  --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index a1efa9d150346b6..f8e3a7a23056fb4 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1949,79 +1949,31 @@ define void @PR34773(ptr %a0, ptr %a1) {
 }
 
 define i16 @PR66194(i8 %q) {
-; SSE2-LABEL: PR66194:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %ecx, %ecx
-; SSE2-NEXT:    testb %dil, %dil
-; SSE2-NEXT:    setne %al
-; SSE2-NEXT:    sete %cl
-; SSE2-NEXT:    movl %ecx, %edx
-; SSE2-NEXT:    shll $16, %edx
-; SSE2-NEXT:    orl %eax, %edx
-; SSE2-NEXT:    movd %edx, %xmm0
-; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
-; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE2-NEXT:    pinsrw $5, %eax, %xmm0
-; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE2-NEXT:    pinsrw $7, %ecx, %xmm0
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR66194:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    xorl %eax, %eax
-; SSSE3-NEXT:    xorl %ecx, %ecx
-; SSSE3-NEXT:    testb %dil, %dil
-; SSSE3-NEXT:    setne %al
-; SSSE3-NEXT:    sete %cl
-; SSSE3-NEXT:    movl %ecx, %edx
-; SSSE3-NEXT:    shll $16, %edx
-; SSSE3-NEXT:    orl %eax, %edx
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
-; SSSE3-NEXT:    pinsrw $3, %eax, %xmm0
-; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSSE3-NEXT:    pinsrw $5, %eax, %xmm0
-; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
-; SSSE3-NEXT:    pinsrw $7, %ecx, %xmm0
-; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT:    psubw %xmm1, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    shll $8, %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    orl %eax, %ecx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    shll $8, %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    orl %eax, %edx
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    shll $16, %eax
-; SSSE3-NEXT:    orl %edx, %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSSE3-NEXT:    shll $24, %edx
-; SSSE3-NEXT:    orl %eax, %edx
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    pinsrw $2, %ecx, %xmm0
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    shll $8, %eax
-; SSSE3-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSSE3-NEXT:    orl %eax, %ecx
-; SSSE3-NEXT:    pinsrw $3, %ecx, %xmm0
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psadbw %xmm0, %xmm1
-; SSSE3-NEXT:    movd %xmm1, %eax
-; SSSE3-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSSE3-NEXT:    retq
+; SSE2-SSSE3-LABEL: PR66194:
+; SSE2-SSSE3:       # %bb.0: # %entry
+; SSE2-SSSE3-NEXT:    xorl %eax, %eax
+; SSE2-SSSE3-NEXT:    xorl %ecx, %ecx
+; SSE2-SSSE3-NEXT:    testb %dil, %dil
+; SSE2-SSSE3-NEXT:    setne %al
+; SSE2-SSSE3-NEXT:    sete %cl
+; SSE2-SSSE3-NEXT:    movl %ecx, %edx
+; SSE2-SSSE3-NEXT:    shll $16, %edx
+; SSE2-SSSE3-NEXT:    orl %eax, %edx
+; SSE2-SSSE3-NEXT:    movd %edx, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $3, %eax, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $5, %eax, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-SSSE3-NEXT:    pinsrw $7, %ecx, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-SSSE3-NEXT:    psubw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE2-SSSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    movd %xmm1, %eax
+; SSE2-SSSE3-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE2-SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: PR66194:
 ; SSE41:       # %bb.0: # %entry
@@ -2040,7 +1992,7 @@ define i16 @PR66194(i8 %q) {
 ; SSE41-NEXT:    pinsrb $14, %ecx, %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE41-NEXT:    psubw %xmm1, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    packuswb %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    psadbw %xmm0, %xmm1
 ; SSE41-NEXT:    movd %xmm1, %eax


        


More information about the llvm-commits mailing list