[llvm] [AArch64] Optimization of repeated constant loads (#51483) (PR #86249)

via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 31 11:21:33 PDT 2024


https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/86249

From 70820348fc655d9a37769214bd60d85a753efa14 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Fri, 22 Mar 2024 14:31:19 +0900
Subject: [PATCH] [AArch64] Optimization of repeated constant loads (#51483)

This change looks for cases where the same 32-bit constant is loaded
into both the upper and lower halves of a 64-bit register. When such a
case is found, only the lower 32-bit constant is materialized, and the
64-bit store is replaced with a store-pair instruction that writes the
value to both halves of the location at once.

For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = MOVKXi $x8, 49370, 32
  renamable $x8 = MOVKXi $x8, 320, 48
  STRXui killed renamable $x8, killed renamable $x0, 0
becomes
  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
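
For reference, 49370 is 0xC0DA and 320 is 0x0140, so the value stored
above is 0x0140C0DA0140C0DA, whose upper and lower 32 bits are both
0x0140C0DA. A hypothetical C++ reproducer (illustrative only, not taken
from the original report) is:

  #include <cstdint>
  // The 64-bit constant repeats its low 32 bits, so before this change
  // the backend materialized it with a MOVZ plus three MOVKs.
  void store_repeated(uint64_t *p) {
    *p = 0x0140C0DA0140C0DAULL;
  }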
---
 llvm/lib/Target/AArch64/AArch64ExpandImm.cpp  |  10 +
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  13 ++
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 173 ++++++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    |  62 +++++++
 4 files changed, 258 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index a7d72b59b1d5a6..2d37d232b8edd9 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -509,6 +509,16 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
     Imm = ~Imm;
 
   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  Shift += 16;
+  Imm16 = (Imm >> Shift) & Mask;
+  if (Imm16 != (isNeg ? Mask : 0))
+    Insn.push_back(
+        {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
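+  // If the immediate repeats its low 32 bits, a single self-ORR shifted
+  // left by 32 completes the expansion. This is only sound for the MOVZ
+  // form, where the bits above the chunks written so far are zero.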
+  if (!isNeg && Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
+    Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
+    return;
+  }
+
   while (Shift < LastShift) {
     Shift += 16;
     Imm16 = (Imm >> Shift) & Mask;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index b2c52b443753dc..acb251abcffbf0 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
                 .addImm(I->Op2));
       }
       break;
+    case AArch64::ORRWrs:
+    case AArch64::ORRXrs: {
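+      // Emit ORR Dst, Dst, Dst, <Op2>: OR the register with a shifted
+      // copy of itself to replicate the bits already materialized.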
+      Register DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MIBS.push_back(
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+              .addReg(DstReg, RegState::Define |
+                                  getDeadRegState(DstIsDead && LastItem) |
+                                  RenamableState)
+              .addReg(DstReg)
+              .addReg(DstReg)
+              .addImm(I->Op2));
+    } break;
     case AArch64::ANDXri:
     case AArch64::EORXri:
       if (I->Op1 == 0) {
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 926a89466255ca..d345d3fa0f6024 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -199,6 +199,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2250,6 +2257,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs: {
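+    // A self-ORR whose shift equals the repeat width (32 for ORRXrs, 16
+    // for ORRWrs) duplicates the low bits, so treat it as part of the
+    // constant materialization.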
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto It = MIs.begin(), End = MIs.end(); It != End; ++It, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *It;
+      break;
+    }
+    (*It)->eraseFromParent();
+  }
+
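+  // Rebuild only the low 32 bits on the W sub-register: MOVZ for the
+  // bottom halfword, MOVK for the halfword above it.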
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
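+  // Replace the 64-bit store with a store-pair of the W register so the
+  // same 32-bit value is written to both halves of the location.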
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      // STRXui scales its offset by 8 but STPWi scales by 4, so double
+      // the immediate to address the same bytes.
+      .addImm(AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * 2)
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
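+  // Walk backwards from the store, reassembling the constant that the
+  // movs build into BaseReg and recording in SuccIndex the point at
+  // which its upper and lower 32 bits become equal.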
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
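+    // Clear the 16-bit slice this mov writes, then merge in its value.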
+    uint64_t Mask = 0xFFFFUL;
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
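+    // On reaching the MOVZ that starts the sequence, apply any
+    // duplicating ORR seen earlier in the backward walk (later in
+    // program order).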
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2512,6 +2664,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We can optimize a `STRXui` that stores a register holding the same
+  // 32-bit constant in its upper and lower halves: materialize only the
+  // low 32 bits and store the W register twice with `STPWi`, saving two
+  // MOVK instructions. For example, this:
+  // mov     x8, 49370
+  // movk    x8, 320, lsl #16
+  // movk    x8, 49370, lsl #32
+  // movk    x8, 320, lsl #48
+  // str     x8, [x0]
+  // is transformed into:
+  // mov     w8, 49370
+  // movk    w8, 320, lsl #16
+  // stp     w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
new file mode 100644
index 00000000000000..c8c93eafd8a6cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load
+# CHECK: renamable $x0 = MOVZXi 49370, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 320, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    renamable $x0 = MOVi64imm 90284035103834330
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+# CHECK: renamable $x0 = MOVZXi 320, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 49370, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    renamable $x0 = MOVi64imm -4550323095879417536
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
+# CHECK: $w8 = MOVZWi 49370, 0
+# CHECK: $w8 = MOVKWi $w8, 320, 16
+# CHECK: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
+# CHECK: STRXui killed renamable $x8, killed renamable $x1, 0
+name: test_fold_repeating_constant_load_store_twice
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    STRXui renamable $x8, killed renamable $x0, 0
+    STRXui killed renamable $x8, killed renamable $x1, 0
+    RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_use_reg_before_store
+# CHECK: renamable $x8 = MOVZXi 49370, 0
+# CHECK: renamable $x8 = MOVKXi $x8, 320, 16
+# CHECK: renamable $x8 = ORRXrs $x8, $x8, 32
+# CHECK: renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+# CHECK: STRXui renamable $x8, killed renamable $x0, 0
+name: test_fold_repeating_constant_load_use_reg_before_store
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+    STRXui renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR


