[llvm] 4c973ae - [PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops

Qiu Chaofan via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 9 23:28:49 PST 2021


Author: Qiu Chaofan
Date: 2021-03-10T15:21:32+08:00
New Revision: 4c973ae51b859dca9792e1ad87a6673c49815a8d

URL: https://github.com/llvm/llvm-project/commit/4c973ae51b859dca9792e1ad87a6673c49815a8d
DIFF: https://github.com/llvm/llvm-project/commit/4c973ae51b859dca9792e1ad87a6673c49815a8d.diff

LOG: [PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops

This patch simplifies the pattern (xxswap (vec-op (xxswap a) (xxswap b)))
into (vec-op a b) when vec-op is lane-insensitive. The motivating case is
the ScalarToVector-VecOp-ExtractElement sequence on little-endian targets,
but the peephole itself is not tied to endianness, so big-endian targets
may benefit as well.
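
For illustration, here is a minimal IR function in the spirit of the added
test (the names are made up) that exercises the pattern:

  define i64 @example(i64 %a, i64 %b) {
    %va = insertelement <2 x i64> undef, i64 %a, i32 0
    %vb = insertelement <2 x i64> undef, i64 %b, i32 0
    %ca = bitcast <2 x i64> %va to <8 x i16>
    %cb = bitcast <2 x i64> %vb to <8 x i16>
    %vr = add <8 x i16> %ca, %cb
    %cr = bitcast <8 x i16> %vr to <2 x i64>
    %r = extractelement <2 x i64> %cr, i32 0
    ret i64 %r
  }

On little-endian subtargets, swaps would previously be emitted around the
vector add to fix up lanes; because the add is lane-insensitive, the
symmetric swaps can now be removed.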

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D97658

Added: 
    llvm/test/CodeGen/PowerPC/swap-reduction.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 15771eef747c..a8aa97eca76c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6717,6 +6717,102 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
     CurDAG->RemoveDeadNodes();
 }
 
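+// Check whether N is a VSX doubleword swap of a single source vector.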
+static bool isVSXSwap(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  // A swap is either the single-operand XXPERMDIs, or an XXPERMDI/XXSLDWI
+  // whose two source operands are the same register, in each case with an
+  // immediate operand of 2.
+  if (Opc == PPC::XXPERMDIs) {
+    return isa<ConstantSDNode>(N->getOperand(1)) &&
+           N->getConstantOperandVal(1) == 2;
+  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
+    return N->getOperand(0) == N->getOperand(1) &&
+           isa<ConstantSDNode>(N->getOperand(2)) &&
+           N->getConstantOperandVal(2) == 2;
+  }
+
+  return false;
+}
+
+// TODO: Make this complete and replace with a table-gen bit.
+static bool isLaneInsensitive(SDValue N) {
+  if (!N->isMachineOpcode())
+    return false;
+  unsigned Opc = N->getMachineOpcode();
+
+  switch (Opc) {
+  default:
+    return false;
+  case PPC::VAVGSB:
+  case PPC::VAVGUB:
+  case PPC::VAVGSH:
+  case PPC::VAVGUH:
+  case PPC::VAVGSW:
+  case PPC::VAVGUW:
+  case PPC::VMAXFP:
+  case PPC::VMAXSB:
+  case PPC::VMAXUB:
+  case PPC::VMAXSH:
+  case PPC::VMAXUH:
+  case PPC::VMAXSW:
+  case PPC::VMAXUW:
+  case PPC::VMINFP:
+  case PPC::VMINSB:
+  case PPC::VMINUB:
+  case PPC::VMINSH:
+  case PPC::VMINUH:
+  case PPC::VMINSW:
+  case PPC::VMINUW:
+  case PPC::VADDFP:
+  case PPC::VADDUBM:
+  case PPC::VADDUHM:
+  case PPC::VADDUWM:
+  case PPC::VSUBFP:
+  case PPC::VSUBUBM:
+  case PPC::VSUBUHM:
+  case PPC::VSUBUWM:
+  case PPC::VAND:
+  case PPC::VANDC:
+  case PPC::VOR:
+  case PPC::VORC:
+  case PPC::VXOR:
+  case PPC::VNOR:
+  case PPC::VMULUWM:
+    return true;
+  }
+}
+
+// Try to simplify (xxswap (vec-op (xxswap a) (xxswap b))) into (vec-op a b)
+// when vec-op is lane-insensitive.
+static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
+  // Our desired xxswap might be the source of a COPY_TO_REGCLASS.
+  // TODO: Can we move this into a common SelectionDAG helper?
+  auto SkipRCCopy = [](SDValue V) {
+    while (V->isMachineOpcode() &&
+           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+      V = V->getOperand(0);
+    return V;
+  };
+
+  SDValue VecOp = SkipRCCopy(N->getOperand(0));
+  if (!isLaneInsensitive(VecOp))
+    return;
+
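+  // Each operand must be a swap with a single user, so removing it cannot
+  // affect any other computation.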
+  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
+          RHS = SkipRCCopy(VecOp.getOperand(1));
+  if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
+      !isVSXSwap(RHS))
+    return;
+
+  // These swaps may still have chain uses here; rely on dead code elimination
+  // in later passes to remove them.
+  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
+  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
+}
+
 void PPCDAGToDAGISel::PeepholePPC64() {
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
@@ -6726,6 +6822,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     if (N->use_empty() || !N->isMachineOpcode())
       continue;
 
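+    // Remove symmetric swaps around lane-insensitive vector operations:
+    // (swap (op (swap a) (swap b))) -> (op a b).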
+    if (isVSXSwap(SDValue(N, 0)))
+      reduceVSXSwap(N, CurDAG);
+
     unsigned FirstOp;
     unsigned StorageOpcode = N->getMachineOpcode();
     bool RequiresMod4Offset = false;

diff --git a/llvm/test/CodeGen/PowerPC/swap-reduction.ll b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
new file mode 100644
index 000000000000..228f89897518
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s
+
+define i64 @test1(i64* %a, i64* %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 5, 3
+; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    mtvsrd 34, 3
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    mtvsrd 35, 4
+; CHECK-NEXT:    vavgsb 2, 2, 3
+; CHECK-NEXT:    stxsdx 34, 0, 5
+; CHECK-NEXT:    blr
+entry:
+  %lhs = load i64, i64* %a, align 8
+  %rhs = load i64, i64* %b, align 8
+  %sum = add i64 %lhs, %rhs
+  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+  %lhc = bitcast <2 x i64> %lv to <16 x i8>
+  %rhc = bitcast <2 x i64> %rv to <16 x i8>
+  %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
+  %cb = bitcast <16 x i8> %add to <2 x i64>
+  %fv = extractelement <2 x i64> %cb, i32 0
+  store i64 %fv, i64* %a, align 8
+  ret i64 %sum
+}
+
+define i64 @test2(i64* %a, i64* %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    mr 5, 3
+; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    mtvsrd 34, 3
+; CHECK-NEXT:    add 3, 3, 4
+; CHECK-NEXT:    mtvsrd 35, 4
+; CHECK-NEXT:    vadduhm 2, 2, 3
+; CHECK-NEXT:    stxsdx 34, 0, 5
+; CHECK-NEXT:    blr
+entry:
+  %lhs = load i64, i64* %a, align 8
+  %rhs = load i64, i64* %b, align 8
+  %sum = add i64 %lhs, %rhs
+  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+  %lhc = bitcast <2 x i64> %lv to <8 x i16>
+  %rhc = bitcast <2 x i64> %rv to <8 x i16>
+  %add = add <8 x i16> %lhc, %rhc
+  %cb = bitcast <8 x i16> %add to <2 x i64>
+  %fv = extractelement <2 x i64> %cb, i32 0
+  store i64 %fv, i64* %a, align 8
+  ret i64 %sum
+}
+
+declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)