[llvm] 4c973ae - [PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops
Qiu Chaofan via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 9 23:28:49 PST 2021
Author: Qiu Chaofan
Date: 2021-03-10T15:21:32+08:00
New Revision: 4c973ae51b859dca9792e1ad87a6673c49815a8d
URL: https://github.com/llvm/llvm-project/commit/4c973ae51b859dca9792e1ad87a6673c49815a8d
DIFF: https://github.com/llvm/llvm-project/commit/4c973ae51b859dca9792e1ad87a6673c49815a8d.diff
LOG: [PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops
This patch simplifies the pattern (xxswap (vec-op (xxswap a) (xxswap b)))
into (vec-op a b) when vec-op is lane-insensitive. The motivating case
is the ScalarToVector-VecOp-ExtractElement sequence on LE, but the
peephole itself is not related to endianness, so BE targets may also
benefit from it.
Reviewed By: nemanjai
Differential Revision: https://reviews.llvm.org/D97658
Added:
llvm/test/CodeGen/PowerPC/swap-reduction.ll
Modified:
llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 15771eef747c..a8aa97eca76c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6717,6 +6717,102 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
CurDAG->RemoveDeadNodes();
}
+// Return true if N is a machine node that swaps the two doublewords of a
+// VSX register: either the single-operand splat form XXPERMDIs, or
+// XXPERMDI/XXSLDWI applied to the same register on both inputs. In every
+// form the immediate selector must be 2, which is the doubleword-swap
+// encoding for these instructions.
+static bool isVSXSwap(SDValue N) {
+ if (!N->isMachineOpcode())
+ return false;
+ unsigned Opc = N->getMachineOpcode();
+
+ // Single-operand XXPERMDI or the regular XXPERMDI/XXSLDWI where the immediate
+ // operand is 2.
+ if (Opc == PPC::XXPERMDIs) {
+ return isa<ConstantSDNode>(N->getOperand(1)) &&
+ N->getConstantOperandVal(1) == 2;
+ } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
+ // Two-operand forms only act as a swap when both sources are the same
+ // value; otherwise the permute mixes two different registers.
+ return N->getOperand(0) == N->getOperand(1) &&
+ isa<ConstantSDNode>(N->getOperand(2)) &&
+ N->getConstantOperandVal(2) == 2;
+ }
+
+ return false;
+}
+
+// Return true if the machine node N is a pure element-wise vector operation,
+// i.e. one whose result in lane i depends only on lane i of its inputs.
+// For such ops, swapping the doublewords of both inputs and then swapping
+// the result is a no-op, which is what reduceVSXSwap exploits.
+// TODO: Make this complete and replace with a table-gen bit.
+static bool isLaneInsensitive(SDValue N) {
+ if (!N->isMachineOpcode())
+ return false;
+ unsigned Opc = N->getMachineOpcode();
+
+ switch (Opc) {
+ default:
+ return false;
+ // Integer/FP average, min/max, add/sub, logical ops and 32-bit multiply:
+ // all operate independently per element.
+ case PPC::VAVGSB:
+ case PPC::VAVGUB:
+ case PPC::VAVGSH:
+ case PPC::VAVGUH:
+ case PPC::VAVGSW:
+ case PPC::VAVGUW:
+ case PPC::VMAXFP:
+ case PPC::VMAXSB:
+ case PPC::VMAXUB:
+ case PPC::VMAXSH:
+ case PPC::VMAXUH:
+ case PPC::VMAXSW:
+ case PPC::VMAXUW:
+ case PPC::VMINFP:
+ case PPC::VMINSB:
+ case PPC::VMINUB:
+ case PPC::VMINSH:
+ case PPC::VMINUH:
+ case PPC::VMINSW:
+ case PPC::VMINUW:
+ case PPC::VADDFP:
+ case PPC::VADDUBM:
+ case PPC::VADDUHM:
+ case PPC::VADDUWM:
+ case PPC::VSUBFP:
+ case PPC::VSUBUBM:
+ case PPC::VSUBUHM:
+ case PPC::VSUBUWM:
+ case PPC::VAND:
+ case PPC::VANDC:
+ case PPC::VOR:
+ case PPC::VORC:
+ case PPC::VXOR:
+ case PPC::VNOR:
+ case PPC::VMULUWM:
+ return true;
+ }
+}
+
+// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
+// lane-insensitive.
+// N is known to be a VSX swap (caller checks isVSXSwap). If its single
+// input is a lane-insensitive op whose two inputs are themselves swaps,
+// all three swaps cancel: the inner swaps are bypassed and N itself is
+// replaced by the vec-op result.
+static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
+ // Our desired xxswap might be source of COPY_TO_REGCLASS.
+ // TODO: Can we put this a common method for DAG?
+ auto SkipRCCopy = [](SDValue V) {
+ while (V->isMachineOpcode() &&
+ V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+ V = V->getOperand(0);
+ return V;
+ };
+
+ SDValue VecOp = SkipRCCopy(N->getOperand(0));
+ if (!isLaneInsensitive(VecOp))
+ return;
+
+ // NOTE(review): hasOneUse() is queried on the node reached *after*
+ // SkipRCCopy, so an intervening COPY_TO_REGCLASS with multiple users is
+ // not accounted for — confirm the swap's value cannot escape through
+ // such a copy before relying on this being sole-use.
+ SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
+ RHS = SkipRCCopy(VecOp.getOperand(1));
+ if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
+ !isVSXSwap(RHS))
+ return;
+
+ // These swaps may still have chain-uses here, count on dead code elimination
+ // in following passes to remove them.
+ DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
+ DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
+}
+
void PPCDAGToDAGISel::PeepholePPC64() {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
@@ -6726,6 +6822,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (N->use_empty() || !N->isMachineOpcode())
continue;
+ if (isVSXSwap(SDValue(N, 0)))
+ reduceVSXSwap(N, CurDAG);
+
unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
bool RequiresMod4Offset = false;
diff --git a/llvm/test/CodeGen/PowerPC/swap-reduction.ll b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
new file mode 100644
index 000000000000..228f89897518
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/swap-reduction.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s
+
+; Scalars are moved to vector registers, combined with vavgsb, and the low
+; element extracted — the expected codegen has no xxswapd/xxpermdi around
+; vavgsb, showing the symmetric swaps were reduced.
+define i64 @test1(i64* %a, i64* %b) {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mr 5, 3
+; CHECK-NEXT: ld 3, 0(3)
+; CHECK-NEXT: ld 4, 0(4)
+; CHECK-NEXT: mtvsrd 34, 3
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: mtvsrd 35, 4
+; CHECK-NEXT: vavgsb 2, 2, 3
+; CHECK-NEXT: stxsdx 34, 0, 5
+; CHECK-NEXT: blr
+entry:
+ %lhs = load i64, i64* %a, align 8
+ %rhs = load i64, i64* %b, align 8
+ %sum = add i64 %lhs, %rhs
+ %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+ %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+ %lhc = bitcast <2 x i64> %lv to <16 x i8>
+ %rhc = bitcast <2 x i64> %rv to <16 x i8>
+ %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
+ %cb = bitcast <16 x i8> %add to <2 x i64>
+ %fv = extractelement <2 x i64> %cb, i32 0
+ store i64 %fv, i64* %a, align 8
+ ret i64 %sum
+}
+
+; Same pattern as test1 but with a plain IR vector add (vadduhm) instead of
+; an intrinsic — again no swap instructions should surround the vector op.
+define i64 @test2(i64* %a, i64* %b) {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: mr 5, 3
+; CHECK-NEXT: ld 3, 0(3)
+; CHECK-NEXT: ld 4, 0(4)
+; CHECK-NEXT: mtvsrd 34, 3
+; CHECK-NEXT: add 3, 3, 4
+; CHECK-NEXT: mtvsrd 35, 4
+; CHECK-NEXT: vadduhm 2, 2, 3
+; CHECK-NEXT: stxsdx 34, 0, 5
+; CHECK-NEXT: blr
+entry:
+ %lhs = load i64, i64* %a, align 8
+ %rhs = load i64, i64* %b, align 8
+ %sum = add i64 %lhs, %rhs
+ %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
+ %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
+ %lhc = bitcast <2 x i64> %lv to <8 x i16>
+ %rhc = bitcast <2 x i64> %rv to <8 x i16>
+ %add = add <8 x i16> %lhc, %rhc
+ %cb = bitcast <8 x i16> %add to <2 x i64>
+ %fv = extractelement <2 x i64> %cb, i32 0
+ store i64 %fv, i64* %a, align 8
+ ret i64 %sum
+}
+
+declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)
More information about the llvm-commits
mailing list