[llvm-branch-commits] [llvm] 4214ca9 - [X86][AVX] Attempt to fold vpermf128(op(x, i), op(y, i)) -> op(vpermf128(x, y), i)

Mon Jan 11 09:13:25 PST 2021

Author: Simon Pilgrim
Date: 2021-01-11T16:59:25Z
New Revision: 4214ca96145c9487407925b121b85fafb1179209

URL: https://github.com/llvm/llvm-project/commit/4214ca96145c9487407925b121b85fafb1179209
DIFF: https://github.com/llvm/llvm-project/commit/4214ca96145c9487407925b121b85fafb1179209.diff

LOG: [X86][AVX] Attempt to fold vpermf128(op(x,i),op(y,i)) -> op(vpermf128(x,y),i)

If vpermf128/vpermi128 is acting on two similar 'inlane' ops, then try to perform the vpermf128 first, which will allow us to merge the ops.

This will help us fix one of the regressions in D56387

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/vector-trunc.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7895f883863f..2f9de876a87f 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36665,6 +36665,43 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
   return SDValue();
 }
 
+/// Attempt to fold vpermf128(op(x,i),op(y,i)) -> op(vpermf128(x,y),i).
+static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
+                                                      SelectionDAG &DAG,
+                                                      const SDLoc &DL) {
+  assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
+
+  MVT VT = V.getSimpleValueType();
+  SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
+  SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
+  unsigned SrcOpc0 = Src0.getOpcode();
+  unsigned SrcOpc1 = Src1.getOpcode();
+  EVT SrcVT0 = Src0.getValueType();
+  EVT SrcVT1 = Src1.getValueType();
+
+  // TODO: Under what circumstances should we push perm2f128 up when we have one
+  // active src?
+  if (SrcOpc0 != SrcOpc1 || SrcVT0 != SrcVT1)
+    return SDValue();
+  // Shifts sharing the same operand 1: permute their inputs, then shift once.
+  switch (SrcOpc0) {
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI:
+  case X86ISD::VSRAI:
+    if (Src0.getOperand(1) == Src1.getOperand(1)) {
+      SDValue Res = DAG.getNode(
+          X86ISD::VPERM2X128, DL, VT, DAG.getBitcast(VT, Src0.getOperand(0)),
+          DAG.getBitcast(VT, Src1.getOperand(0)), V.getOperand(2));
+      Res = DAG.getNode(SrcOpc0, DL, SrcVT0, DAG.getBitcast(SrcVT0, Res),
+                        Src0.getOperand(1));
+      return DAG.getBitcast(VT, Res);
+    }
+    break;
+  }
+  // No fold applied; the caller treats an empty SDValue as 'no change'.
+  return SDValue();
+}
+
 /// Try to combine x86 target specific shuffles.
 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
@@ -37045,6 +37082,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     return SDValue();
   }
   case X86ISD::VPERM2X128: {
+    if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
+        return Res;
+
     // If both 128-bit values were inserted into high halves of 256-bit values,
     // the shuffle can be reduced to a concatenation of subvectors:
     // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
@@ -37053,6 +37093,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
     SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
     unsigned Imm = N.getConstantOperandVal(2);
+
     if (!(Imm == 0x31 &&
           Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
           Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&

diff  --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index bd8b7dd355cc..f35e315bbb0b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -107,11 +107,9 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
 ;
 ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
 ; AVX2-SLOW:       # %bb.0: # %entry
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
+; AVX2-SLOW-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: