[llvm] 228442a - [X86] canonicalizeShuffleMaskWithHorizOp - fold 256-bit permute(hop(x,y)) -> hop(extract(x),extract(x)) iff we only demand the lower elements

Thu Jul 13 02:37:24 PDT 2023

Author: Simon Pilgrim
Date: 2023-07-13T10:37:08+01:00
New Revision: 228442a14ceb804d74416a7701b012ffeb759aff

URL: https://github.com/llvm/llvm-project/commit/228442a14ceb804d74416a7701b012ffeb759aff
DIFF: https://github.com/llvm/llvm-project/commit/228442a14ceb804d74416a7701b012ffeb759aff.diff

LOG: [X86] canonicalizeShuffleMaskWithHorizOp - fold 256-bit permute(hop(x,y)) -> hop(extract(x),extract(x)) iff we only demand the lower elements

Attempt to recognise when we can narrow a 256-bit hop to a lower 128-bit hop by extracting the requested subvectors (and then widening back)

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 10c2d98eafec40..e4d74eccd2c3dc 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41100,6 +41100,25 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
     }
   }
 
+  // If we are post-shuffling a 256-bit hop and not requiring the upper
+  // elements, then try to narrow to a 128-bit hop directly.
+  SmallVector<int, 16> WideMask64;
+  if (Ops.size() == 1 && NumLanes == 2 &&
+      scaleShuffleElements(Mask, 4, WideMask64) &&
+      isUndefInRange(WideMask64, 2, 2)) {
+    int M0 = WideMask64[0];
+    int M1 = WideMask64[1];
+    if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
+      MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
+      unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
+      unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
+      SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
+      SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
+      SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
+      return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
+    }
+  }
+
   return SDValue();
 }
 

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 29763134dd5f01..9b26f95f514782 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -639,9 +639,8 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readno
 define <8 x i16> @shuffle_combine_packusdw_permq_extract(<8 x i32> %a0) {
 ; CHECK-LABEL: shuffle_combine_packusdw_permq_extract:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> poison)