[llvm] [PPC]Optimize zeroing accumulator and spilling instructions into simple instructions (PR #96094)
zhijian lin via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 6 12:23:45 PDT 2024
https://github.com/diggerlin updated https://github.com/llvm/llvm-project/pull/96094
>From 26beee4aa0a87575db1946998739e0b3203f394b Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Tue, 18 Jun 2024 16:08:49 -0400
Subject: [PATCH 1/2] first commit of replace XXSETACCZ
---
.../lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 88 +++++++++++++++++++
llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 36 ++++----
2 files changed, 104 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index d45edd74ab8544..4dac9d1708d8a1 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -109,6 +109,93 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
MachineFunctionProperties::Property::NoVRegs);
}
+ // The function simplifies the zeroing accumulator and spilling instructions
+ // into a simple xxlxor and spilling instructions.
+ // From:
+ // setaccz acci
+ // xxmfacc acci
+ // stxv vsr(i*4+0), D(1)
+ // stxv vsr(i*4+1), D-16(1)
+ // stxv vsr(i*4+2), D-32(1)
+ // stxv vsr(i*4+3), D-48(1)
+
+ // To:
+ // xxlxor vsr(i*4), 0, 0
+ // stxv vsr(i*4), D(1)
+ // stxv vsr(i*4), D-16(1)
+ // stxv vsr(i*4), D-32(1)
+ // stxv vsr(i*4), D-48(1)
+ bool
+ OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
+ const TargetRegisterInfo *TRI) const {
+ bool changed = false;
+ for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
+ if (BBI->getOpcode() != PPC::XXSETACCZ)
+ continue;
+
+ Register ACCZReg = BBI->getOperand(0).getReg();
+
+ DenseSet<MachineInstr *> InstrsToErase;
+ InstrsToErase.insert(&*BBI++);
+
+ if (BBI->getOpcode() != PPC::XXMFACC) {
+ --BBI;
+ continue;
+ }
+
+ Register ACCWReg = BBI->getOperand(0).getReg();
+
+ if (ACCWReg != ACCZReg)
+ continue;
+
+ auto XXMFACCInstr = BBI;
+ InstrsToErase.insert(&*BBI++);
+
+ Register VSLRegBase = (ACCWReg - PPC::ACC0) * 4 + PPC::VSL0;
+ bool isVSLRegBaseKilled = false;
+ for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+ if (BBI->getOpcode() == PPC::STXV) {
+ Register Reg0 = BBI->getOperand(0).getReg();
+ // If the VSLRegBase Register is killed, we put the kill in the
+ // last STXV instruction.
+ if (Reg0 == VSLRegBase && BBI->getOperand(0).isKill())
+ isVSLRegBaseKilled = true;
+ if (Reg0 < VSLRegBase || Reg0 > VSLRegBase + 3)
+ continue;
+ } else {
+ --BBI;
+ continue;
+ }
+ }
+
+ BBI = XXMFACCInstr;
+ BBI++;
+ for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+ Register VSLiReg = BBI->getOperand(0).getReg();
+ BBI->substituteRegister(VSLiReg, VSLRegBase, 0, *TRI);
+ BBI->getOperand(0).setIsKill(false);
+ }
+
+ if (isVSLRegBaseKilled)
+ (--BBI)->getOperand(0).setIsKill(true);
+
+ DebugLoc DL = XXMFACCInstr->getDebugLoc();
+ const PPCInstrInfo *TII = XXMFACCInstr->getMF()
+ ->getSubtarget<PPCSubtarget>()
+ .getInstrInfo();
+
+ BuildMI(MBB, &*XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
+ .addReg(VSLRegBase,RegState::Undef)
+ .addReg(VSLRegBase,RegState::Undef);
+
+ for (MachineInstr *MI : InstrsToErase)
+ MI->eraseFromParent();
+
+ changed |= true;
+ }
+ return changed;
+ }
+
// This function removes any redundant load immediates. It has two level
// loops - The outer loop finds the load immediates BBI that could be used
// to replace following redundancy. The inner loop scans instructions that
@@ -466,6 +553,7 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
Changed |= removeRedundantLIs(MBB, TRI);
Changed |= addLinkerOpt(MBB, TRI);
Changed |= removeAccPrimeUnprime(MBB);
+ Changed |= OptimizeZeroingAccumulatorSpilling(MBB, TRI);
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
if (Opc == PPC::UNENCODED_NOP) {
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 53b0a2737122e1..17e24eefc25805 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -115,22 +115,20 @@ declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
define void @int_xxsetaccz(ptr %ptr) {
; CHECK-LABEL: int_xxsetaccz:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xxsetaccz acc0
-; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: xxlxor vs0, vs0, vs0
; CHECK-NEXT: stxv vs0, 48(r3)
-; CHECK-NEXT: stxv vs1, 32(r3)
-; CHECK-NEXT: stxv vs2, 16(r3)
-; CHECK-NEXT: stxv vs3, 0(r3)
+; CHECK-NEXT: stxv vs0, 32(r3)
+; CHECK-NEXT: stxv vs0, 16(r3)
+; CHECK-NEXT: stxv vs0, 0(r3)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: int_xxsetaccz:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: xxsetaccz acc0
-; CHECK-BE-NEXT: xxmfacc acc0
-; CHECK-BE-NEXT: stxv vs1, 16(r3)
+; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
+; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: stxv vs0, 0(r3)
-; CHECK-BE-NEXT: stxv vs3, 48(r3)
-; CHECK-BE-NEXT: stxv vs2, 32(r3)
+; CHECK-BE-NEXT: stxv vs0, 48(r3)
+; CHECK-BE-NEXT: stxv vs0, 32(r3)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -143,22 +141,20 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble
define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
; CHECK-LABEL: disass_acc:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xxsetaccz acc0
-; CHECK-NEXT: xxmfacc acc0
-; CHECK-NEXT: stxv vs3, 0(r3)
-; CHECK-NEXT: stxv vs2, 0(r4)
-; CHECK-NEXT: stxv vs1, 0(r5)
+; CHECK-NEXT: xxlxor vs0, vs0, vs0
+; CHECK-NEXT: stxv vs0, 0(r3)
+; CHECK-NEXT: stxv vs0, 0(r4)
+; CHECK-NEXT: stxv vs0, 0(r5)
; CHECK-NEXT: stxv vs0, 0(r6)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: disass_acc:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: xxsetaccz acc0
-; CHECK-BE-NEXT: xxmfacc acc0
+; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
; CHECK-BE-NEXT: stxv vs0, 0(r3)
-; CHECK-BE-NEXT: stxv vs1, 0(r4)
-; CHECK-BE-NEXT: stxv vs2, 0(r5)
-; CHECK-BE-NEXT: stxv vs3, 0(r6)
+; CHECK-BE-NEXT: stxv vs0, 0(r4)
+; CHECK-BE-NEXT: stxv vs0, 0(r5)
+; CHECK-BE-NEXT: stxv vs0, 0(r6)
; CHECK-BE-NEXT: blr
entry:
%0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
>From 2f8614ac27522e894144751a6fb9c855d8a37a50 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Fri, 9 Aug 2024 15:38:33 -0400
Subject: [PATCH 2/2] deal with not adjacent instruction
---
.../lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 162 ++++++++++++------
llvm/test/CodeGen/PowerPC/mma-intrinsics.ll | 18 +-
2 files changed, 117 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index 4dac9d1708d8a1..06bcbe15465a65 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -114,86 +114,142 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
// From:
// setaccz acci
// xxmfacc acci
- // stxv vsr(i*4+0), D(1)
- // stxv vsr(i*4+1), D-16(1)
- // stxv vsr(i*4+2), D-32(1)
- // stxv vsr(i*4+3), D-48(1)
+ // stxv vsr(i*4+0), D(Base)
+ // stxv vsr(i*4+1), D-16(Base)
+ // stxv vsr(i*4+2), D-32(Base)
+ // stxv vsr(i*4+3), D-48(Base)
// To:
// xxlxor vsr(i*4), 0, 0
- // stxv vsr(i*4), D(1)
- // stxv vsr(i*4), D-16(1)
- // stxv vsr(i*4), D-32(1)
- // stxv vsr(i*4), D-48(1)
+ // stxv vsr(i*4), D(Base)
+ // stxv vsr(i*4), D-16(Base)
+ // stxv vsr(i*4), D-32(Base)
+ // stxv vsr(i*4), D-48(Base)
bool
OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI) const {
- bool changed = false;
+ bool Changed = false;
+ DenseSet<MachineInstr *> InstrsToErase;
for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
if (BBI->getOpcode() != PPC::XXSETACCZ)
continue;
-
Register ACCZReg = BBI->getOperand(0).getReg();
+ MachineInstr *XXSETACCZInstr = nullptr;
+ MachineInstr *XXMFACCInstr = nullptr;
+ auto STXVInstrIter = MBB.begin();
+ bool isVSLRegBaseKilled = false;
+ Register VSLRegBase;
+
+ XXSETACCZInstr = &*BBI++;
+ for (auto TBBI = BBI; TBBI != MBB.instr_end(); ++TBBI) {
+ if (!XXMFACCInstr) {
+ if (TBBI->getOpcode() != PPC::XXMFACC) {
+ // Check whether the accumulator is redefined between XXSETACCZ
+ // and XXMFACC. If it is, we will not optimize them.
+ bool IsACCZRegRedefined = false;
+ for (unsigned i = 0; i < TBBI->getNumOperands(); i++) {
+ MachineOperand &Operand = TBBI->getOperand(i);
+ if (!Operand.isReg())
+ continue;
+ Register OperandReg = Operand.getReg();
+ // Check whether the accumulator `ACCZReg` is redefined.
+ if (OperandReg == ACCZReg && Operand.isDef())
+ IsACCZRegRedefined = true;
+ }
+ // If the ACCZReg is redefined, stop checking whether the `XXSETACCZ`
+ // has a corresponding `XXMFACC`.
+ if (IsACCZRegRedefined)
+ break;
- DenseSet<MachineInstr *> InstrsToErase;
- InstrsToErase.insert(&*BBI++);
-
- if (BBI->getOpcode() != PPC::XXMFACC) {
- --BBI;
- continue;
- }
-
- Register ACCWReg = BBI->getOperand(0).getReg();
-
- if (ACCWReg != ACCZReg)
- continue;
+ continue;
+ } else {
+ // Check if XXSETACCZ uses the same accumulator as the `XXMFACC`
+ // instruction.
+ if (TBBI->getOperand(0).getReg() != ACCZReg)
+ continue;
+ }
- auto XXMFACCInstr = BBI;
- InstrsToErase.insert(&*BBI++);
+ XXMFACCInstr = &*TBBI++;
+ VSLRegBase = (ACCZReg - PPC::ACC0) * 4 + PPC::VSL0;
+ }
- Register VSLRegBase = (ACCWReg - PPC::ACC0) * 4 + PPC::VSL0;
- bool isVSLRegBaseKilled = false;
- for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
- if (BBI->getOpcode() == PPC::STXV) {
- Register Reg0 = BBI->getOperand(0).getReg();
- // If the VSLRegBase Register is killed, we put the kill in the
- // last STXV instruction.
- if (Reg0 == VSLRegBase && BBI->getOperand(0).isKill())
- isVSLRegBaseKilled = true;
- if (Reg0 < VSLRegBase || Reg0 > VSLRegBase + 3)
- continue;
+ // Check whether it is a PPC::STXV instruction.
+ if (TBBI->getOpcode() != PPC::STXV) {
+ bool isVSLRedefinedOrUsed = false;
+ // Check whether the VSL register mapped to ACCZReg is redefined or
+ // used by non-STXV instructions.
+ for (unsigned i = 0; i < TBBI->getNumOperands(); i++) {
+ MachineOperand &Operand = TBBI->getOperand(i);
+ if (!Operand.isReg())
+ continue;
+ Register OperandReg = Operand.getReg();
+ Register VSRpBase = (ACCZReg - PPC::ACC0) * 2 + PPC::VSRp0;
+ if ((OperandReg >= VSLRegBase && OperandReg <= VSLRegBase + 3) ||
+ (OperandReg > VSRpBase && OperandReg <= VSRpBase + 1)) {
+ isVSLRedefinedOrUsed = true;
+ break;
+ }
+ }
+ // If the VSL register mapped to ACCZReg is redefined or used by a
+ // non-STXV instruction, we will not perform the optimization.
+ if (isVSLRedefinedOrUsed) {
+ XXMFACCInstr = nullptr;
+ break;
+ }
} else {
- --BBI;
- continue;
- }
+ // Check whether there are four consecutive STXV instructions.
+ STXVInstrIter = TBBI;
+ for (unsigned InstrCount = 0; InstrCount < 4;
+ ++InstrCount, ++TBBI) {
+ if (TBBI->getOpcode() == PPC::STXV) {
+ Register Reg0 = TBBI->getOperand(0).getReg();
+ // If the VSLRegBase Register is killed, we put the kill in the
+ // last STXV instruction.
+ // FIXME: We may need to update killed flag for other vsr as
+ // well.
+ if (Reg0 == VSLRegBase && TBBI->getOperand(0).isKill())
+ isVSLRegBaseKilled = true;
+ if (Reg0 >= VSLRegBase && Reg0 <= VSLRegBase + 3)
+ continue;
+ // The register operand of the STXV instruction is not a VSL
+ // register mapped to ACCZReg.
+ XXMFACCInstr = nullptr;
+ break;
+ }
+ }
}
+ // There are four consecutive STXV instructions.
+ break;
+ }
- BBI = XXMFACCInstr;
- BBI++;
- for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
- Register VSLiReg = BBI->getOperand(0).getReg();
- BBI->substituteRegister(VSLiReg, VSLRegBase, 0, *TRI);
- BBI->getOperand(0).setIsKill(false);
+ if (XXMFACCInstr && STXVInstrIter != MBB.begin()) {
+ for (unsigned InstrCount = 0; InstrCount < 4;
+ ++InstrCount, ++STXVInstrIter) {
+ Register VSLReg = STXVInstrIter->getOperand(0).getReg();
+ STXVInstrIter->substituteRegister(VSLReg, VSLRegBase, 0, *TRI);
+ STXVInstrIter->getOperand(0).setIsKill(false);
}
if (isVSLRegBaseKilled)
- (--BBI)->getOperand(0).setIsKill(true);
+ (--STXVInstrIter)->getOperand(0).setIsKill(true);
DebugLoc DL = XXMFACCInstr->getDebugLoc();
const PPCInstrInfo *TII = XXMFACCInstr->getMF()
->getSubtarget<PPCSubtarget>()
.getInstrInfo();
- BuildMI(MBB, &*XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
- .addReg(VSLRegBase,RegState::Undef)
- .addReg(VSLRegBase,RegState::Undef);
-
- for (MachineInstr *MI : InstrsToErase)
- MI->eraseFromParent();
+ BuildMI(MBB, XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
+ .addReg(VSLRegBase, RegState::Undef)
+ .addReg(VSLRegBase, RegState::Undef);
- changed |= true;
+ InstrsToErase.insert(XXSETACCZInstr);
+ InstrsToErase.insert(XXMFACCInstr);
+ Changed |= true;
}
- return changed;
+ }
+ for (MachineInstr *MI : InstrsToErase)
+ MI->eraseFromParent();
+ return Changed;
}
// This function removes any redundant load immediates. It has two level
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 17e24eefc25805..e01b8dc07af5d9 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -536,14 +536,13 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r0, 16(r1)
; CHECK-NEXT: stdu r1, -112(r1)
-; CHECK-NEXT: xxsetaccz acc0
; CHECK-NEXT: xxsetaccz acc1
; CHECK-NEXT: mr r30, r3
-; CHECK-NEXT: xxmfacc acc0
+; CHECK-NEXT: xxlxor vs0, vs0, vs0
; CHECK-NEXT: stxv vs0, 48(r3)
-; CHECK-NEXT: stxv vs1, 32(r3)
-; CHECK-NEXT: stxv vs2, 16(r3)
-; CHECK-NEXT: stxv vs3, 0(r3)
+; CHECK-NEXT: stxv vs0, 32(r3)
+; CHECK-NEXT: stxv vs0, 16(r3)
+; CHECK-NEXT: stxv vs0, 0(r3)
; CHECK-NEXT: xvf32gerpp acc1, v2, v2
; CHECK-NEXT: xxmfacc acc1
; CHECK-NEXT: stxv vs4, 80(r1)
@@ -568,15 +567,14 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
; CHECK-BE-NEXT: mflr r0
; CHECK-BE-NEXT: std r0, 16(r1)
; CHECK-BE-NEXT: stdu r1, -192(r1)
-; CHECK-BE-NEXT: xxsetaccz acc0
; CHECK-BE-NEXT: xxsetaccz acc1
; CHECK-BE-NEXT: std r30, 176(r1) # 8-byte Folded Spill
; CHECK-BE-NEXT: mr r30, r3
-; CHECK-BE-NEXT: xxmfacc acc0
-; CHECK-BE-NEXT: stxv vs1, 16(r3)
+; CHECK-BE-NEXT: xxlxor vs0, vs0, vs0
+; CHECK-BE-NEXT: stxv vs0, 16(r3)
; CHECK-BE-NEXT: stxv vs0, 0(r3)
-; CHECK-BE-NEXT: stxv vs3, 48(r3)
-; CHECK-BE-NEXT: stxv vs2, 32(r3)
+; CHECK-BE-NEXT: stxv vs0, 48(r3)
+; CHECK-BE-NEXT: stxv vs0, 32(r3)
; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
; CHECK-BE-NEXT: xxmfacc acc1
; CHECK-BE-NEXT: stxv vs4, 112(r1)
More information about the llvm-commits
mailing list