[llvm] e8305c0 - [X86] combineX86ShuffleChain - don't fold to truncate(concat(V1,V2)) if it was already a PACK op

Mon Apr 25 09:13:56 PDT 2022

Author: Simon Pilgrim
Date: 2022-04-25T17:13:44+01:00
New Revision: e8305c0b8f492272446d517e67d451f390d35dbe

URL: https://github.com/llvm/llvm-project/commit/e8305c0b8f492272446d517e67d451f390d35dbe
DIFF: https://github.com/llvm/llvm-project/commit/e8305c0b8f492272446d517e67d451f390d35dbe.diff

LOG: [X86] combineX86ShuffleChain - don't fold to truncate(concat(V1,V2)) if it was already a PACK op

Fixes #55050

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3120cd5483c5b..de0eb3b4c09c5 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37801,7 +37801,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
          (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
         (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
         isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
-      if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+      // Bail if this was already a truncation or PACK node.
+      // We sometimes fail to match PACK if we demand known undef elements.
+      if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
+                         Root.getOpcode() == X86ISD::PACKSS ||
+                         Root.getOpcode() == X86ISD::PACKUS))
         return SDValue(); // Nothing to do!
       ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
       ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 9fb4252b3e71d..295e3305f5b33 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -175,3 +175,47 @@ define <8 x i32> @PR46393(<8 x i16> %a0, i8 %a1) {
   %sel = select <8 x i1> %mask, <8 x i32> %shl, <8 x i32> zeroinitializer
   ret <8 x i32> %sel
 }
+
+define i64 @PR55050() {
+; X86-LABEL: PR55050:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    jne .LBB10_2
+; X86-NEXT:  # %bb.1: # %if
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB10_2: # %exit
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR55050:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+entry:
+  %i275 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
+  %i277 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> undef, <16 x i8> zeroinitializer)
+  br i1 undef, label %exit, label %if
+
+if:
+  %i298 = bitcast <2 x i64> %i275 to <4 x i32>
+  %i299 = bitcast <2 x i64> %i277 to <4 x i32>
+  %i300 = shufflevector <4 x i32> %i298, <4 x i32> %i299, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %i339 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %i300, <4 x i32> undef)
+  %i354 = shufflevector <8 x i16> %i339, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef>
+  %i356 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %i354, <8 x i16> undef)
+  %i357 = shufflevector <16 x i8> %i356, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 5, i32 4, i32 16, i32 2, i32 1, i32 0, i32 16, i32 10, i32 9, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %i361 = extractelement <16 x i8> %i357, i64 8
+  %i360 = and i8 %i361, 63
+  %i379 = zext i8 %i360 to i64
+  br label %exit
+
+exit:
+  %res = phi i64 [ %i379, %if ], [ 0, %entry ]
+  ret i64 %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)