[llvm] c13e3e2 - [PowerPC][Power10] Exploit the xxsplti32dx instruction when lowering VECTOR_SHUFFLE.

Mon Jul 6 18:29:05 PDT 2020

Author: Amy Kwan
Date: 2020-07-06T20:28:38-05:00
New Revision: c13e3e2c2e0c774917bcc7f4f50c29c8133d3a55

URL: https://github.com/llvm/llvm-project/commit/c13e3e2c2e0c774917bcc7f4f50c29c8133d3a55
DIFF: https://github.com/llvm/llvm-project/commit/c13e3e2c2e0c774917bcc7f4f50c29c8133d3a55.diff

LOG: [PowerPC][Power10] Exploit the xxsplti32dx instruction when lowering VECTOR_SHUFFLE.

This patch aims to exploit the xxsplti32dx XT, IX, IMM32 instruction when lowering VECTOR_SHUFFLEs.
We implement lowerToXXSPLTI32DX when lowering vector shuffles to check if:
- Element size is 4 bytes
- The RHS is a constant vector (and constant splat of 4-bytes)
- The shuffle mask is a suitable mask for the XXSPLTI32DX instruction where it is one of the 32 masks:
<0, 4-7, 2, 4-7>
<4-7, 1, 4-7, 3>

Differential Revision: https://reviews.llvm.org/D83245

Added: 
    llvm/test/CodeGen/PowerPC/p10-splatImm32.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/lib/Target/PowerPC/PPCInstrPrefix.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 40619519664f..815a84e8c320 100644

--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1477,6 +1477,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
   case PPCISD::XXSPLTI_SP_TO_DP:
     return "PPCISD::XXSPLTI_SP_TO_DP";
+  case PPCISD::XXSPLTI32DX:
+    return "PPCISD::XXSPLTI32DX";
   case PPCISD::VECINSERT:       return "PPCISD::VECINSERT";
   case PPCISD::XXPERMDI:        return "PPCISD::XXPERMDI";
   case PPCISD::VECSHL:          return "PPCISD::VECSHL";
@@ -9778,6 +9780,77 @@ SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
 }
 
+/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
+/// return the default SDValue.
+SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
+                                              SelectionDAG &DAG) const {
+  // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
+  // to v16i8. Peek through the bitcasts to get the actual operands.
+  SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
+  SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
+
+  auto ShuffleMask = SVN->getMask();
+  SDValue VecShuffle(SVN, 0);
+  SDLoc DL(SVN);
+
+  // Check that we have a four byte shuffle.
+  if (!isNByteElemShuffleMask(SVN, 4, 1))
+    return SDValue();
+
+  // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
+  if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
+    std::swap(LHS, RHS);
+    VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
+    ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
+  }
+
+  // Ensure that the RHS is a vector of constants.
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  if (!BVN)
+    return SDValue();
+
+  // Check if RHS is a splat of 4-bytes (or smaller).
+  APInt APSplatValue, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+      SplatBitSize > 32)
+    return SDValue();
+
+  // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
+  // The instruction splats a constant C into two words of the source vector
+  // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
+  // Thus we check that the shuffle mask is the equivalent  of
+  // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
+  // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
+  // within each word are consecutive, so we only need to check the first byte.
+  SDValue Index;
+  bool IsLE = Subtarget.isLittleEndian();
+  if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
+      (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
+       ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
+    Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
+  else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
+           (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
+            ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
+    Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
+  else
+    return SDValue();
+
+  // If the splat is narrower than 32-bits, we need to get the 32-bit value
+  // for XXSPLTI32DX.
+  unsigned SplatVal = APSplatValue.getZExtValue();
+  for (; SplatBitSize < 32; SplatBitSize <<= 1)
+    SplatVal |= (SplatVal << SplatBitSize);
+
+  SDValue SplatNode = DAG.getNode(
+      PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
+      Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
+}
+
 /// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
 /// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
 /// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
@@ -9895,6 +9968,12 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
   }
 
+  if (Subtarget.hasPrefixInstrs()) {
+    SDValue SplatInsertNode;
+    if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
+      return SplatInsertNode;
+  }
+
   if (Subtarget.hasP9Altivec()) {
     SDValue NewISDNode;
     if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 98256ae0c359..768eaa43e013 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -102,6 +102,10 @@ namespace llvm {
     /// vector or scalar.
     XXSPLTI_SP_TO_DP,
 
+    /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction.
+    ///
+    XXSPLTI32DX,
+
     /// VECINSERT - The PPC vector insert instruction
     ///
     VECINSERT,
@@ -1270,6 +1274,10 @@ namespace llvm {
     /// essentially v16i8 vector version of VINSERTH.
     SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
 
+    /// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+    /// handled by the XXSPLTI32DX instruction introduced in ISA 3.1.
+    SDValue lowerToXXSPLTI32DX(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
     // Return whether the call instruction can potentially be optimized to a
     // tail call. This will cause the optimizers to attempt to move, or
     // duplicate return instructions to help enable tail call optimizations.

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 818c4e839e0a..3ed651abe453 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1,3 +1,19 @@
+//===----------------------------------------------------------------------===//
+// PowerPC ISA 3.1 specific type constraints.
+//
+
+def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
+  SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ISA 3.1 specific PPCISD nodes.
+//
+
+def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+
+//===----------------------------------------------------------------------===//
+
 // PC Relative flag (for instructions that use the address of the prefix for
 // address computations).
 class isPCRel { bit PCRel = 1; }
@@ -732,8 +748,11 @@ let Predicates = [PrefixInstrs] in {
                                             (PPCxxspltidp i32:$IMM32))]>;
   def XXSPLTI32DX :
       8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
-                             (ins vsrc:$XTi, i1imm:$IX, i32imm:$IMM32),
-                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral, []>,
+                             (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
+                             "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
+                             [(set v2i64:$XT,
+                                   (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
+                                                   i32:$IMM32))]>,
                              RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
   def XXPERMX :
     8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,

diff  --git a/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
new file mode 100644
index 000000000000..d610bd260fc9
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-splatImm32.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN:     FileCheck --check-prefix=CHECK-LE %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \
+; RUN:     -ppc-asm-full-reg-names -mcpu=pwr10 < %s | \
+; RUN:     FileCheck --check-prefix=CHECK-BE %s
+
+; Function Attrs: norecurse nounwind readnone
+define  <4 x i32> @test_xxsplti32dx_1(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_1:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, 566
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_1:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 1, 566
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 566, i32 undef, i32 566>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define  <4 x i32> @test_xxsplti32dx_2(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_2:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, 33
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_2:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, 33
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> <i32 33, i32 undef, i32 33, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define  <4 x i32> @test_xxsplti32dx_3(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_3:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, 12
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_3:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 1, 12
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> %a, <4 x i32> <i32 undef, i32 12, i32 undef, i32 12>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define  <4 x i32> @test_xxsplti32dx_4(<4 x i32> %a) {
+; CHECK-LE-LABEL: test_xxsplti32dx_4:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, -683
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_4:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, -683
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> <i32 -683, i32 undef, i32 -683, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}
+
+; Function Attrs: nounwind
+define  <4 x float> @test_xxsplti32dx_5(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_5:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 0, 1065353216
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_5:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 1, 1065353216
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins3.i = shufflevector <4 x float> %vfa, <4 x float> <float undef, float 1.000000e+00, float undef, float 1.000000e+00>, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins3.i
+}
+
+; Function Attrs: nounwind
+define  <4 x float> @test_xxsplti32dx_6(<4 x float> %vfa) {
+; CHECK-LE-LABEL: test_xxsplti32dx_6:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, 1073741824
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_6:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, 1073741824
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins3.i = shufflevector <4 x float> <float 2.000000e+00, float undef, float 2.000000e+00, float undef>, <4 x float> %vfa, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %vecins3.i
+}
+
+; Function Attrs: norecurse nounwind readnone
+; Test to illustrate when the splat is narrower than 32-bits.
+define dso_local <4 x i32> @test_xxsplti32dx_7(<4 x i32> %a) local_unnamed_addr #0 {
+; CHECK-LE-LABEL: test_xxsplti32dx_7:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    xxsplti32dx vs34, 1, -1414812757
+; CHECK-LE-NEXT:    blr
+;
+; CHECK-BE-LABEL: test_xxsplti32dx_7:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    xxsplti32dx vs34, 0, -1414812757
+; CHECK-BE-NEXT:    blr
+entry:
+  %vecins1 = shufflevector <4 x i32> <i32 -1414812757, i32 undef, i32 -1414812757, i32 undef>, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x i32> %vecins1
+}