[llvm] 967d953 - [X86] lowerShuffleAsVALIGN - extend to recognize basic shifted element masks
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 30 10:34:07 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-30T18:32:55+01:00
New Revision: 967d95382d0eac27f1a1387cf2640cdd48bca03e
URL: https://github.com/llvm/llvm-project/commit/967d95382d0eac27f1a1387cf2640cdd48bca03e
DIFF: https://github.com/llvm/llvm-project/commit/967d95382d0eac27f1a1387cf2640cdd48bca03e.diff
LOG: [X86] lowerShuffleAsVALIGN - extend to recognize basic shifted element masks
Try to use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ
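For illustration (taken verbatim from the updated vector-shuffle-512-v8.ll test below), the basic case is a shuffle whose trailing lanes are all known zero and whose remaining lanes form one sequential run from a single source, i.e. a whole-vector element shift:

define <8 x i64> @shuffle_v8i64_34567zzz(<8 x i64> %a, <8 x i64> %b) {
  %valign = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8>
  ret <8 x i64> %valign
}

With this change the shuffle is lowered to a single VALIGNQ against a zeroed register (zmm0 = zmm0[3,4,5,6,7],zmm1[0,1,2] with zmm1 zeroed by VPXOR) instead of a VPERMT2Q that needs a constant-pool index vector.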
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddsubvector.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b0da91f4ada0ca..6ac9391bcb58c8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11214,6 +11214,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
/// rotate* of the vector lanes.
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
@@ -11225,11 +11226,40 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
- if (Rotation <= 0)
+ if (0 < Rotation)
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+ DAG.getTargetConstant(Rotation, DL, MVT::i8));
+
+ // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
+ // TODO: Pull this out as a matchShuffleAsElementShift helper?
+ // TODO: We can probably make this more aggressive and use shift-pairs like
+ // lowerShuffleAsByteShiftMask.
+ unsigned NumElts = Mask.size();
+ unsigned ZeroLo = Zeroable.countr_one();
+ unsigned ZeroHi = Zeroable.countl_one();
+  assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
+ if (!ZeroLo && !ZeroHi)
return SDValue();
- return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
- DAG.getTargetConstant(Rotation, DL, MVT::i8));
+
+  if (ZeroLo) {
+ SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+ int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
+ if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
+ getZeroVector(VT, Subtarget, DAG, DL),
+ DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
+ }
+
+ if (ZeroHi) {
+ SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
+ int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
+ if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
+ return DAG.getNode(X86ISD::VALIGN, DL, VT,
+ getZeroVector(VT, Subtarget, DAG, DL), Src,
+ DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
+ }
+
+ return SDValue();
}
/// Try to lower a vector shuffle as a byte shift sequence.
@@ -12625,7 +12655,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
@@ -12940,7 +12970,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
@@ -15674,7 +15704,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
@@ -15936,7 +15966,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
@@ -16585,7 +16615,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
// Try to use PALIGNR.
@@ -16669,7 +16699,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use VALIGN.
if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
- Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Rotate;
// Try to use byte rotation instructions.
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index ce0e1048ed048a..f57defc368d5ef 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -139,9 +139,8 @@ define <16 x i32> @PR42819(ptr %a0) {
; AVX512-LABEL: PR42819:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512-NEXT: movw $-8192, %ax # imm = 0xE000
-; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: valignd {{.*#+}} zmm0 = zmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2]
; AVX512-NEXT: retq
%1 = load <8 x i32>, ptr %a0, align 4
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6069b3f834d71d..2bb7dd90cc05f6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1665,19 +1665,11 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
}
define <8 x i64> @shuffle_v8i64_34567zzz(<8 x i64> %a, <8 x i64> %b) {
-; AVX512F-LABEL: shuffle_v8i64_34567zzz:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,4,5,6,7,13,14,15]
-; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512F-32-LABEL: shuffle_v8i64_34567zzz:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,0,4,0,5,0,6,0,7,0,13,0,14,0,15,0]
-; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
+; ALL-LABEL: shuffle_v8i64_34567zzz:
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7],zmm1[0,1,2]
+; ALL-NEXT: ret{{[l|q]}}
%valign = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8>
ret <8 x i64> %valign
}
@@ -1685,9 +1677,8 @@ define <8 x i64> @shuffle_v8i64_34567zzz(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_zz012345(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_zz012345:
; ALL: # %bb.0:
-; ALL-NEXT: movb $-4, %al
-; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT: valignq {{.*#+}} zmm0 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; ALL-NEXT: ret{{[l|q]}}
%valign = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
ret <8 x i64> %valign
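The matching added here is small enough to trace by hand. The sketch below is a minimal standalone model of the new check, assuming a plain-integer mask encoding; the helper name matchShiftedElementMask and the std::vector types are illustrative only and not part of the LLVM API. It counts the zeroable lanes at each end of the mask, verifies the remaining lanes form a sequential run from one source, and derives the VALIGN immediate: NumElts - ZeroLo when the zeros sit in the low lanes, ZeroHi when they sit in the high lanes.

// Minimal standalone model of the matching added to lowerShuffleAsVALIGN.
// Helper name and plain-vector encoding are illustrative; the real code
// works on SDValues, ArrayRef<int> masks and an APInt Zeroable bitmask.
#include <cstdio>
#include <vector>

// Mask elements: 0..N-1 select from V1, N..2N-1 select from V2, -1 is undef.
// zeroable[i] marks lanes known to be zero. Returns the VALIGN immediate if
// the mask is a shifted run padded with zeros, or -1 if it does not match.
// The caller guarantees that not every lane is zeroable.
static int matchShiftedElementMask(const std::vector<int> &mask,
                                   const std::vector<bool> &zeroable,
                                   bool *zerosInLow) {
  int numElts = (int)mask.size();
  int zeroLo = 0, zeroHi = 0;
  while (zeroLo < numElts && zeroable[zeroLo]) ++zeroLo;               // countr_one
  while (zeroHi < numElts && zeroable[numElts - 1 - zeroHi]) ++zeroHi; // countl_one
  if (!zeroLo && !zeroHi)
    return -1;

  // Mirrors isSequentialOrUndefInRange(Mask, Pos, Size, Low).
  auto isSequentialFrom = [&](int pos, int size, int low) {
    for (int i = 0; i < size; ++i)
      if (mask[pos + i] != -1 && mask[pos + i] != low + i)
        return false;
    return true;
  };

  if (zeroLo) {
    // Zeros in the low lanes: VALIGN(Src, Zero, NumElts - ZeroLo).
    int low = mask[zeroLo] < numElts ? 0 : numElts;
    if (isSequentialFrom(zeroLo, numElts - zeroLo, low)) {
      *zerosInLow = true;
      return numElts - zeroLo;
    }
  }
  if (zeroHi) {
    // Zeros in the high lanes: VALIGN(Zero, Src, ZeroHi).
    int low = mask[0] < numElts ? 0 : numElts;
    if (isSequentialFrom(0, numElts - zeroHi, low + zeroHi)) {
      *zerosInLow = false;
      return zeroHi;
    }
  }
  return -1;
}

int main() {
  // shuffle_v8i64_34567zzz: <3,4,5,6,7,z,z,z>.
  std::vector<int> m1 = {3, 4, 5, 6, 7, 8, 8, 8};
  std::vector<bool> z1 = {false, false, false, false, false, true, true, true};
  // shuffle_v8i64_zz012345: <z,z,0,1,2,3,4,5>.
  std::vector<int> m2 = {8, 8, 0, 1, 2, 3, 4, 5};
  std::vector<bool> z2 = {true, true, false, false, false, false, false, false};
  bool low;
  printf("34567zzz -> imm %d\n", matchShiftedElementMask(m1, z1, &low));
  printf("zz012345 -> imm %d\n", matchShiftedElementMask(m2, z2, &low));
}

Run on the two masks from the tests above, this reports immediates 3 and 6, which line up with the element patterns in the new valignq check lines (zmm0[3,4,5,6,7],zmm1[0,1,2] and zmm1[6,7],zmm0[0,1,2,3,4,5]).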