[llvm] [PPC]Optimize zeroing accumulator and spilling instructions into simple instructions (PR #96094)

zhijian lin via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 6 12:23:45 PDT 2024


https://github.com/diggerlin updated https://github.com/llvm/llvm-project/pull/96094

>From 26beee4aa0a87575db1946998739e0b3203f394b Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Tue, 18 Jun 2024 16:08:49 -0400
Subject: [PATCH 1/2] first commit of replace XXSETACCZ

---
 .../lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 88 +++++++++++++++++++
 llvm/test/CodeGen/PowerPC/mma-intrinsics.ll   | 36 ++++----
 2 files changed, 104 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index d45edd74ab8544..4dac9d1708d8a1 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -109,6 +109,93 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
           MachineFunctionProperties::Property::NoVRegs);
     }
 
+    // The function will simplify the zeroing accumulator and spilling
+    // instructions into simple xxlxor and spilling instructions.
+    // From:
+    // setaccz acci
+    // xxmfacc acci
+    // stxv vsr(i*4+0), D(1)
+    // stxv vsr(i*4+1), D-16(1)
+    // stxv vsr(i*4+2), D-32(1)
+    // stxv vsr(i*4+3), D-48(1)
+
+    // To:
+    // xxlxor vsr(i*4), 0, 0
+    // stxv vsr(i*4), D(1)
+    // stxv vsr(i*4), D-16(1)
+    // stxv vsr(i*4), D-32(1)
+    // stxv vsr(i*4), D-48(1)
+    bool
+    OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
+                                       const TargetRegisterInfo *TRI) const {
+      bool changed = false;
+      for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
+        if (BBI->getOpcode() != PPC::XXSETACCZ)
+          continue;
+
+        Register ACCZReg = BBI->getOperand(0).getReg();
+
+        DenseSet<MachineInstr *> InstrsToErase;
+        InstrsToErase.insert(&*BBI++);
+
+        if (BBI->getOpcode() != PPC::XXMFACC) {
+	  --BBI;
+          continue;
+	}
+
+        Register ACCWReg = BBI->getOperand(0).getReg();
+
+        if (ACCWReg != ACCZReg) 
+          continue;
+
+        auto XXMFACCInstr = BBI;
+        InstrsToErase.insert(&*BBI++);
+
+        Register VSLRegBase = (ACCWReg - PPC::ACC0) * 4 + PPC::VSL0;
+        bool isVSLRegBaseKilled = false;
+        for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+          if (BBI->getOpcode() == PPC::STXV) {
+            Register Reg0 = BBI->getOperand(0).getReg();
+            // If the VSLRegBase Register is killed, we put the kill in the
+            // last STXV instruction.
+            if (Reg0 == VSLRegBase && BBI->getOperand(0).isKill())
+              isVSLRegBaseKilled = true;
+            if (Reg0 < VSLRegBase || Reg0 > VSLRegBase + 3)
+              continue;
+          } else {
+	      --BBI;
+              continue;
+	  }
+          }
+
+          BBI = XXMFACCInstr;
+          BBI++;
+          for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
+            Register VSLiReg = BBI->getOperand(0).getReg();
+            BBI->substituteRegister(VSLiReg, VSLRegBase, 0, *TRI);
+            BBI->getOperand(0).setIsKill(false);
+          }
+
+          if (isVSLRegBaseKilled)
+            (--BBI)->getOperand(0).setIsKill(true);
+
+          DebugLoc DL = XXMFACCInstr->getDebugLoc();
+          const PPCInstrInfo *TII = XXMFACCInstr->getMF()
+                                        ->getSubtarget<PPCSubtarget>()
+                                        .getInstrInfo();
+
+          BuildMI(MBB, &*XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
+              .addReg(VSLRegBase,RegState::Undef)
+              .addReg(VSLRegBase,RegState::Undef);
+
+          for (MachineInstr *MI : InstrsToErase)
+            MI->eraseFromParent();
+
+          changed |= true;
+        }
+      return changed;
+    }
+
     // This function removes any redundant load immediates. It has two level
     // loops - The outer loop finds the load immediates BBI that could be used
     // to replace following redundancy. The inner loop scans instructions that
@@ -466,6 +553,7 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
         Changed |= removeRedundantLIs(MBB, TRI);
         Changed |= addLinkerOpt(MBB, TRI);
         Changed |= removeAccPrimeUnprime(MBB);
+        Changed |= OptimizeZeroingAccumulatorSpilling(MBB, TRI);
         for (MachineInstr &MI : MBB) {
           unsigned Opc = MI.getOpcode();
           if (Opc == PPC::UNENCODED_NOP) {
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 53b0a2737122e1..17e24eefc25805 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -115,22 +115,20 @@ declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
 define void @int_xxsetaccz(ptr %ptr) {
 ; CHECK-LABEL: int_xxsetaccz:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxsetaccz acc0
-; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    xxlxor vs0, vs0, vs0
 ; CHECK-NEXT:    stxv vs0, 48(r3)
-; CHECK-NEXT:    stxv vs1, 32(r3)
-; CHECK-NEXT:    stxv vs2, 16(r3)
-; CHECK-NEXT:    stxv vs3, 0(r3)
+; CHECK-NEXT:    stxv vs0, 32(r3)
+; CHECK-NEXT:    stxv vs0, 16(r3)
+; CHECK-NEXT:    stxv vs0, 0(r3)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: int_xxsetaccz:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsetaccz acc0
-; CHECK-BE-NEXT:    xxmfacc acc0
-; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxlxor vs0, vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
 ; CHECK-BE-NEXT:    stxv vs0, 0(r3)
-; CHECK-BE-NEXT:    stxv vs3, 48(r3)
-; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
@@ -143,22 +141,20 @@ declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble
 define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) {
 ; CHECK-LABEL: disass_acc:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xxsetaccz acc0
-; CHECK-NEXT:    xxmfacc acc0
-; CHECK-NEXT:    stxv vs3, 0(r3)
-; CHECK-NEXT:    stxv vs2, 0(r4)
-; CHECK-NEXT:    stxv vs1, 0(r5)
+; CHECK-NEXT:    xxlxor vs0, vs0, vs0 
+; CHECK-NEXT:    stxv vs0, 0(r3)
+; CHECK-NEXT:    stxv vs0, 0(r4)
+; CHECK-NEXT:    stxv vs0, 0(r5)
 ; CHECK-NEXT:    stxv vs0, 0(r6)
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-BE-LABEL: disass_acc:
 ; CHECK-BE:       # %bb.0: # %entry
-; CHECK-BE-NEXT:    xxsetaccz acc0
-; CHECK-BE-NEXT:    xxmfacc acc0
+; CHECK-BE-NEXT:    xxlxor vs0, vs0, vs0
 ; CHECK-BE-NEXT:    stxv vs0, 0(r3)
-; CHECK-BE-NEXT:    stxv vs1, 0(r4)
-; CHECK-BE-NEXT:    stxv vs2, 0(r5)
-; CHECK-BE-NEXT:    stxv vs3, 0(r6)
+; CHECK-BE-NEXT:    stxv vs0, 0(r4)
+; CHECK-BE-NEXT:    stxv vs0, 0(r5)
+; CHECK-BE-NEXT:    stxv vs0, 0(r6)
 ; CHECK-BE-NEXT:    blr
 entry:
   %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()

>From 2f8614ac27522e894144751a6fb9c855d8a37a50 Mon Sep 17 00:00:00 2001
From: zhijian <zhijian at ca.ibm.com>
Date: Fri, 9 Aug 2024 15:38:33 -0400
Subject: [PATCH 2/2] deal with not adjacent instruction

---
 .../lib/Target/PowerPC/PPCPreEmitPeephole.cpp | 162 ++++++++++++------
 llvm/test/CodeGen/PowerPC/mma-intrinsics.ll   |  18 +-
 2 files changed, 117 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index 4dac9d1708d8a1..06bcbe15465a65 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -114,86 +114,142 @@ static bool hasPCRelativeForm(MachineInstr &Use) {
     // From:
     // setaccz acci
     // xxmfacc acci
-    // stxv vsr(i*4+0), D(1)
-    // stxv vsr(i*4+1), D-16(1)
-    // stxv vsr(i*4+2), D-32(1)
-    // stxv vsr(i*4+3), D-48(1)
+    // stxv vsr(i*4+0), D(Base)
+    // stxv vsr(i*4+1), D-16(Base)
+    // stxv vsr(i*4+2), D-32(Base)
+    // stxv vsr(i*4+3), D-48(Base)
 
     // To:
     // xxlxor vsr(i*4), 0, 0
-    // stxv vsr(i*4), D(1)
-    // stxv vsr(i*4), D-16(1)
-    // stxv vsr(i*4), D-32(1)
-    // stxv vsr(i*4), D-48(1)
+    // stxv vsr(i*4), D(Base)
+    // stxv vsr(i*4), D-16(Base)
+    // stxv vsr(i*4), D-32(Base)
+    // stxv vsr(i*4), D-48(Base)
     bool
     OptimizeZeroingAccumulatorSpilling(MachineBasicBlock &MBB,
                                        const TargetRegisterInfo *TRI) const {
-      bool changed = false;
+      bool Changed = false;
+      DenseSet<MachineInstr *> InstrsToErase;
       for (auto BBI = MBB.instr_begin(); BBI != MBB.instr_end(); ++BBI) {
         if (BBI->getOpcode() != PPC::XXSETACCZ)
           continue;
-
         Register ACCZReg = BBI->getOperand(0).getReg();
+        MachineInstr *XXSETACCZInstr = nullptr;
+        MachineInstr *XXMFACCInstr = nullptr;
+        auto STXVInstrIter = MBB.begin();
+        bool isVSLRegBaseKilled = false;
+        Register VSLRegBase;
+
+        XXSETACCZInstr = &*BBI++;
+        for (auto TBBI = BBI; TBBI != MBB.instr_end(); ++TBBI) {
+          if (!XXMFACCInstr) {
+            if (TBBI->getOpcode() != PPC::XXMFACC) {
+              // Check whether the accumulator is redefined between XXSETACCZ
+              // and XXMFACC. If it is, we will not optimize them.
+              bool IsACCZRegRedefined = false;
+              for (unsigned i = 0; i < TBBI->getNumOperands(); i++) {
+                MachineOperand &Operand = TBBI->getOperand(i);
+                if (!Operand.isReg())
+                  continue;
+                Register OperandReg = Operand.getReg();
+                // Check whether the accumulator `ACCZReg` is redefined.
+                if (OperandReg == ACCZReg && Operand.isDef())
+                  IsACCZRegRedefined = true;
+              }
+              // If the ACCZReg is redefined, stop checking whether the
+              // `XXSETACCZ` has a corresponding `XXMFACC`.
+              if (IsACCZRegRedefined)
+                break;
 
-        DenseSet<MachineInstr *> InstrsToErase;
-        InstrsToErase.insert(&*BBI++);
-
-        if (BBI->getOpcode() != PPC::XXMFACC) {
-	  --BBI;
-          continue;
-	}
-
-        Register ACCWReg = BBI->getOperand(0).getReg();
-
-        if (ACCWReg != ACCZReg) 
-          continue;
+              continue;
+            } else {
+              // Check if XXSETACCZ uses the same accumulator as the `XXMFACC`
+              // instruction.
+              if (TBBI->getOperand(0).getReg() != ACCZReg)
+                continue;
+            }
 
-        auto XXMFACCInstr = BBI;
-        InstrsToErase.insert(&*BBI++);
+            XXMFACCInstr = &*TBBI++;
+            VSLRegBase = (ACCZReg - PPC::ACC0) * 4 + PPC::VSL0;
+          }
 
-        Register VSLRegBase = (ACCWReg - PPC::ACC0) * 4 + PPC::VSL0;
-        bool isVSLRegBaseKilled = false;
-        for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
-          if (BBI->getOpcode() == PPC::STXV) {
-            Register Reg0 = BBI->getOperand(0).getReg();
-            // If the VSLRegBase Register is killed, we put the kill in the
-            // last STXV instruction.
-            if (Reg0 == VSLRegBase && BBI->getOperand(0).isKill())
-              isVSLRegBaseKilled = true;
-            if (Reg0 < VSLRegBase || Reg0 > VSLRegBase + 3)
-              continue;
+          // Check whether it is a PPC::STXV instruction.
+          if (TBBI->getOpcode() != PPC::STXV) {
+            bool isVSLRedefinedOrUsed = false;
+            // Check whether the VSL register mapped to ACCZReg is redefined or
+            // used by non-STXV instructions.
+            for (unsigned i = 0; i < TBBI->getNumOperands(); i++) {
+              MachineOperand &Operand = TBBI->getOperand(i);
+              if (!Operand.isReg())
+                continue;
+              Register OperandReg = Operand.getReg();
+              Register VSRpBase = (ACCZReg - PPC::ACC0) * 2 + PPC::VSRp0;
+              if ((OperandReg >= VSLRegBase && OperandReg <= VSLRegBase + 3) ||
+                  (OperandReg > VSRpBase && OperandReg <= VSRpBase + 1)) {
+                isVSLRedefinedOrUsed = true;
+                break;
+              }
+            }
+            // If the VSL register mapped to ACCZReg is redefined or used by a
+            // non-STXV instruction, we will not perform the optimization.
+            if (isVSLRedefinedOrUsed) {
+              XXMFACCInstr = nullptr;
+              break;
+            }
           } else {
-	      --BBI;
-              continue;
-	  }
+            // Check whether there are four consecutive STXV instructions.
+            STXVInstrIter = TBBI;
+            for (unsigned InstrCount = 0; InstrCount < 4;
+                 ++InstrCount, ++TBBI) {
+              if (TBBI->getOpcode() == PPC::STXV) {
+                Register Reg0 = TBBI->getOperand(0).getReg();
+                // If the VSLRegBase Register is killed, we put the kill in the
+                // last STXV instruction.
+                // FIXME: We may need to update killed flag for other vsr as
+                // well.
+                if (Reg0 == VSLRegBase && TBBI->getOperand(0).isKill())
+                  isVSLRegBaseKilled = true;
+                if (Reg0 >= VSLRegBase && Reg0 <= VSLRegBase + 3)
+                  continue;
+                // The register operand of the STXV instruction is not a VSL
+                // register mapped to ACCZReg.
+                XXMFACCInstr = nullptr;
+                break;
+              }
+            }
           }
+          // There are four consecutive STXV instructions.
+          break;
+        }
 
-          BBI = XXMFACCInstr;
-          BBI++;
-          for (unsigned InstrCount = 0; InstrCount < 4; ++InstrCount, ++BBI) {
-            Register VSLiReg = BBI->getOperand(0).getReg();
-            BBI->substituteRegister(VSLiReg, VSLRegBase, 0, *TRI);
-            BBI->getOperand(0).setIsKill(false);
+        if (XXMFACCInstr && STXVInstrIter != MBB.begin()) {
+          for (unsigned InstrCount = 0; InstrCount < 4;
+               ++InstrCount, ++STXVInstrIter) {
+            Register VSLReg = STXVInstrIter->getOperand(0).getReg();
+            STXVInstrIter->substituteRegister(VSLReg, VSLRegBase, 0, *TRI);
+            STXVInstrIter->getOperand(0).setIsKill(false);
           }
 
           if (isVSLRegBaseKilled)
-            (--BBI)->getOperand(0).setIsKill(true);
+            (--STXVInstrIter)->getOperand(0).setIsKill(true);
 
           DebugLoc DL = XXMFACCInstr->getDebugLoc();
           const PPCInstrInfo *TII = XXMFACCInstr->getMF()
                                         ->getSubtarget<PPCSubtarget>()
                                         .getInstrInfo();
 
-          BuildMI(MBB, &*XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
-              .addReg(VSLRegBase,RegState::Undef)
-              .addReg(VSLRegBase,RegState::Undef);
-
-          for (MachineInstr *MI : InstrsToErase)
-            MI->eraseFromParent();
+          BuildMI(MBB, XXMFACCInstr, DL, TII->get(PPC::XXLXOR), VSLRegBase)
+              .addReg(VSLRegBase, RegState::Undef)
+              .addReg(VSLRegBase, RegState::Undef);
 
-          changed |= true;
+          InstrsToErase.insert(XXSETACCZInstr);
+          InstrsToErase.insert(XXMFACCInstr);
+          Changed |= true;
         }
-      return changed;
+      }
+      for (MachineInstr *MI : InstrsToErase)
+        MI->eraseFromParent();
+      return Changed;
     }
 
     // This function removes any redundant load immediates. It has two level
diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
index 17e24eefc25805..e01b8dc07af5d9 100644
--- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll
@@ -536,14 +536,13 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
 ; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r0, 16(r1)
 ; CHECK-NEXT:    stdu r1, -112(r1)
-; CHECK-NEXT:    xxsetaccz acc0
 ; CHECK-NEXT:    xxsetaccz acc1
 ; CHECK-NEXT:    mr r30, r3
-; CHECK-NEXT:    xxmfacc acc0
+; CHECK-NEXT:    xxlxor vs0, vs0, vs0
 ; CHECK-NEXT:    stxv vs0, 48(r3)
-; CHECK-NEXT:    stxv vs1, 32(r3)
-; CHECK-NEXT:    stxv vs2, 16(r3)
-; CHECK-NEXT:    stxv vs3, 0(r3)
+; CHECK-NEXT:    stxv vs0, 32(r3)
+; CHECK-NEXT:    stxv vs0, 16(r3)
+; CHECK-NEXT:    stxv vs0, 0(r3)
 ; CHECK-NEXT:    xvf32gerpp acc1, v2, v2
 ; CHECK-NEXT:    xxmfacc acc1
 ; CHECK-NEXT:    stxv vs4, 80(r1)
@@ -568,15 +567,14 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind {
 ; CHECK-BE-NEXT:    mflr r0
 ; CHECK-BE-NEXT:    std r0, 16(r1)
 ; CHECK-BE-NEXT:    stdu r1, -192(r1)
-; CHECK-BE-NEXT:    xxsetaccz acc0
 ; CHECK-BE-NEXT:    xxsetaccz acc1
 ; CHECK-BE-NEXT:    std r30, 176(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    mr r30, r3
-; CHECK-BE-NEXT:    xxmfacc acc0
-; CHECK-BE-NEXT:    stxv vs1, 16(r3)
+; CHECK-BE-NEXT:    xxlxor vs0, vs0, vs0
+; CHECK-BE-NEXT:    stxv vs0, 16(r3)
 ; CHECK-BE-NEXT:    stxv vs0, 0(r3)
-; CHECK-BE-NEXT:    stxv vs3, 48(r3)
-; CHECK-BE-NEXT:    stxv vs2, 32(r3)
+; CHECK-BE-NEXT:    stxv vs0, 48(r3)
+; CHECK-BE-NEXT:    stxv vs0, 32(r3)
 ; CHECK-BE-NEXT:    xvf32gerpp acc1, v2, v2
 ; CHECK-BE-NEXT:    xxmfacc acc1
 ; CHECK-BE-NEXT:    stxv vs4, 112(r1)



More information about the llvm-commits mailing list