[llvm] [AArch64] Optimization of repeated constant loads (#51483) (PR #86249)

via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 12 01:36:35 PDT 2024


https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/86249

>From 257adb5f5bd9964ebb021062d91656a11c8d52c0 Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Fri, 12 Apr 2024 16:53:37 +0900
Subject: [PATCH 1/2] [AArch64] Optimize `MOV` to `ORR` when loading symmetric
 constants (#51483)

This change looks for cases of symmetric constant loading. A
`symmetric constant load` is one where the upper 32 bits and the lower
32 bits of a 64-bit register are loaded with the same value.

When such a sequence is found, it is replaced with a shorter one that
materializes only the lower 32 bits of the constant and then copies
them into the upper 32 bits with a single `ORR` (shifted register).

For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = MOVKXi $x8, 49370, 32
  renamable $x8 = MOVKXi $x8, 320, 48
becomes
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = ORRXrs $x8, $x8, 32
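
The transformation applies only when the two 32-bit halves of the
64-bit immediate are identical (here 0x0140C0DA0140C0DA, where
0xC0DA = 49370 and 0x0140 = 320). As a rough, standalone C++ sketch of
that check (the helper name is invented for illustration and is not
part of the patch):

  #include <cstdint>

  // True when the upper and lower 32 bits of a 64-bit immediate hold
  // the same non-zero value, e.g. 0x0140C0DA0140C0DA.
  static bool isRepeatedHalves(uint64_t Imm) {
    return Imm != 0 && (Imm >> 32) == (Imm & 0xFFFFFFFFULL);
  }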
---
 llvm/lib/Target/AArch64/AArch64ExpandImm.cpp  | 10 +++++++
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 13 +++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    | 27 +++++++++++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
index a7d72b59b1d5a6..2d37d232b8edd9 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp
@@ -509,6 +509,16 @@ static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
     Imm = ~Imm;
 
   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
+  Shift += 16;
+  Imm16 = (Imm >> Shift) & Mask;
+  if (Imm16 != (isNeg ? Mask : 0))
+    Insn.push_back(
+        {Opc, Imm16, AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)});
+  if (Imm != 0 && (Imm >> 32) == (Imm & UINT_MAX)) {
+    Insn.push_back({BitSize == 32 ? AArch64::ORRWrs : AArch64::ORRXrs, 0, 32});
+    return;
+  }
+
   while (Shift < LastShift) {
     Shift += 16;
     Imm16 = (Imm >> Shift) & Mask;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 03f0778bae59d5..36957bb0f5a059 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -168,6 +168,19 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
                 .addImm(I->Op2));
       }
       break;
+    case AArch64::ORRWrs:
+    case AArch64::ORRXrs: {
+      Register DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MIBS.push_back(
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
+              .addReg(DstReg, RegState::Define |
+                                  getDeadRegState(DstIsDead && LastItem) |
+                                  RenamableState)
+              .addReg(DstReg)
+              .addReg(DstReg)
+              .addImm(I->Op2));
+    } break;
     case AArch64::ANDXri:
     case AArch64::EORXri:
       if (I->Op1 == 0) {
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
new file mode 100644
index 00000000000000..641eaf0be23123
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -0,0 +1,27 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=aarch64 -verify-machineinstrs -run-pass=aarch64-expand-pseudo -run-pass=aarch64-ldst-opt -debug-only=aarch64-ldst-opt %s -o - | FileCheck %s
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load
+# CHECK: renamable $x0 = MOVZXi 49370, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 320, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    renamable $x0 = MOVi64imm 90284035103834330
+    RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_neg
+# CHECK: renamable $x0 = MOVZXi 320, 0
+# CHECK: renamable $x0 = MOVKXi $x0, 49370, 16
+# CHECK: renamable $x0 = ORRXrs $x0, $x0, 32
+name: test_fold_repeating_constant_load_neg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    renamable $x0 = MOVi64imm -4550323095879417536
+    RET_ReallyLR implicit $x0

>From b546f76d031872dfdbc49883c396222e2d1a29aa Mon Sep 17 00:00:00 2001
From: Hanbum Park <kese111 at gmail.com>
Date: Fri, 12 Apr 2024 16:54:01 +0900
Subject: [PATCH 2/2] [AArch64] Optimize `MOV` to `STP` when loading symmetric
 constants (#51483)

This change looks for cases where a symmetric constant is materialized
and then stored as a whole 64-bit value. A `symmetric constant load` is
one where the upper 32 bits and the lower 32 bits of a 64-bit register
are loaded with the same value.

When such a sequence is found, the constant is materialized only in the
lower 32 bits and the 64-bit store is replaced with an `STP` that
stores the 32-bit value to both halves of the memory location.

For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = MOVKXi $x8, 49370, 32
  renamable $x8 = MOVKXi $x8, 320, 48
  STRXui killed renamable $x8, killed renamable $x0, 0
becomes
  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
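
The load/store optimizer walks backwards from the store, re-assembling
the immediate from the MOVZ/MOVK chunks it encounters and checking
whether the two 32-bit halves match. A minimal sketch of that
accumulation, assuming plain (value, shift) pairs rather than
MachineInstrs (the function name is invented for illustration):

  #include <cstdint>
  #include <utility>
  #include <vector>

  // Rebuild the constant from MOVZ/MOVK-style (value, shift) chunks and
  // report whether its two 32-bit halves ended up identical.
  static bool
  halvesMatch(const std::vector<std::pair<uint64_t, unsigned>> &Chunks) {
    uint64_t Accumulated = 0;
    for (const auto &[Value, Shift] : Chunks) {
      Accumulated &= ~(0xFFFFULL << Shift); // clear the 16-bit slot
      Accumulated |= Value << Shift;        // insert the new chunk
    }
    return Accumulated != 0 &&
           (Accumulated >> 32) == (Accumulated & 0xFFFFFFFFULL);
  }

For example, the chunks (49370, 0), (320, 16), (49370, 32), (320, 48)
rebuild 0x0140C0DA0140C0DA, whose halves match, so the sequence above
qualifies for the `STP` rewrite.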
---
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 173 ++++++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    |  35 ++++
 2 files changed, 208 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index d0adb78b231a76..c316bd66074c8f 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,6 +201,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of repeated constant values.
+  bool foldRepeatedConstantLoads(MachineBasicBlock::iterator &I,
+                                 unsigned Limit);
+  MachineBasicBlock::iterator tryToFoldRepeatedConstantLoads(
+      MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+      int SuccIndex, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2259,151 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isRepeatable(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+  case AArch64::ORRWrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    unsigned BitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == BitSize)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::tryToFoldRepeatedConstantLoads(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  int Index = 0;
+
+  for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+    if (Index == SuccIndex - 1) {
+      FirstMovI = *MI;
+      break;
+    }
+    (*MI)->eraseFromParent();
+  }
+
+  Register DstRegW =
+      TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVZWi),
+          DstRegW)
+      .addImm(Accumulated & Mask)
+      .addImm(0);
+  BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(), TII->get(AArch64::MOVKWi),
+          DstRegW)
+      .addUse(DstRegW)
+      .addImm((Accumulated >> 16) & Mask)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+  FirstMovI->eraseFromParent();
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::foldRepeatedConstantLoads(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0, DupBitSize = 0;
+  uint64_t Accumulated = 0;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isRepeatable(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        break;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs || Opc == AArch64::ORRWrs) {
+      DupBitSize = Opc == AArch64::ORRXrs ? 32 : 16;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    uint64_t mask = 0xFFFFUL;
+    Accumulated -= (Accumulated & (mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+
+    if (ValueOrder == 1 && DupBitSize) {
+      Accumulated |= Accumulated << DupBitSize;
+      DupBitSize = 0;
+    }
+
+    if (Accumulated != 0 && (Accumulated >> 32) == (Accumulated & UINT_MAX))
+      SuccIndex = MIs.size();
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = tryToFoldRepeatedConstantLoads(MI, MIs, SuccIndex, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2518,6 +2670,27 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We have an opportunity to optimize the `STRXui` instruction when it
+  // stores a 64-bit value whose upper and lower 32 bits are identical: the
+  // value can be built once in a W register and stored twice with `STPWi`.
+  // Consider:
+  // mov     x8, 49370
+  // movk    x8, 320, lsl #16
+  // movk    x8, 49370, lsl #32
+  // movk    x8, 320, lsl #48
+  // str     x8, [x0]
+  // Transform to:
+  // mov     w8, 49370
+  // movk    w8, 320, lsl #16
+  // stp     w8, w8, [x0]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (foldRepeatedConstantLoads(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 641eaf0be23123..c8c93eafd8a6cf 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -25,3 +25,38 @@ body:             |
 
     renamable $x0 = MOVi64imm -4550323095879417536
     RET_ReallyLR implicit $x0
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_store_twice
+# CHECK: $w8 = MOVZWi 49370, 0
+# CHECK: $w8 = MOVKWi $w8, 320, 16
+# CHECK: STPWi renamable $w8, renamable $w8, killed renamable $x0, 0
+# CHECK: STRXui killed renamable $x8, killed renamable $x1, 0
+name: test_fold_repeating_constant_load_store_twice
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    STRXui renamable $x8, killed renamable $x0, 0
+    STRXui killed renamable $x8, killed renamable $x1, 0
+    RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: test_fold_repeating_constant_load_use_reg_before_store
+# CHECK: renamable $x8 = MOVZXi 49370, 0
+# CHECK: renamable $x8 = MOVKXi $x8, 320, 16
+# CHECK: renamable $x8 = ORRXrs $x8, $x8, 32
+# CHECK: renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+# CHECK: STRXui renamable $x8, killed renamable $x0, 0
+name: test_fold_repeating_constant_load_use_reg_before_store
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+
+    renamable $x8 = MOVi64imm 90284035103834330
+    renamable $x9 = ADDXrs renamable $x8, renamable $x8, 32
+    STRXui renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR


