[llvm] [AArch64] Optimize when storing symmetry constants (PR #93717)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 26 03:17:47 PDT 2024


https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/93717

>From 45d5738bb044cd239a94e9e9127c18569d9e4eeb Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 25 May 2024 16:54:40 +0900
Subject: [PATCH 1/7] [AArch64] Add PreTest for storing symmetry constant

---
 .../CodeGen/AArch64/movimm-expand-ldst.ll     | 159 ++++++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    | 102 +++++++++++
 2 files changed, 261 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index b25ac96f97c7d..b4931043c2353 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -93,3 +93,162 @@ define i64 @testuu0xf555f555f555f555() {
 ; CHECK-NEXT:    ret
   ret i64 u0xf555f555f555f555
 }
+
+define void @test_store_0x1234567812345678(ptr %x) {
+; CHECK-LABEL: test_store_0x1234567812345678:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
+; CHECK-NEXT:    orr x8, x8, x8, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x1234567812345678, ptr %x
+  ret void
+}
+
+define void @test_store_0xff3456ffff3456ff(ptr %x) {
+; CHECK-LABEL: test_store_0xff3456ffff3456ff:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22271 // =0x56ff
+; CHECK-NEXT:    movk x8, #65332, lsl #16
+; CHECK-NEXT:    orr x8, x8, x8, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0xff3456ffff3456ff, ptr %x
+  ret void
+}
+
+define void @test_store_0x00345600345600(ptr %x) {
+; CHECK-LABEL: test_store_0x00345600345600:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22016 // =0x5600
+; CHECK-NEXT:    movk x8, #52, lsl #16
+; CHECK-NEXT:    movk x8, #13398, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x00345600345600, ptr %x
+  ret void
+}
+
+define void @test_store_0x5555555555555555(ptr %x) {
+; CHECK-LABEL: test_store_0x5555555555555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x5555555555555555, ptr %x
+  ret void
+}
+
+define void @test_store_0x5055555550555555(ptr %x) {
+; CHECK-LABEL: test_store_0x5055555550555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    and x8, x8, #0xf0fffffff0ffffff
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x5055555550555555, ptr %x
+  ret void
+}
+
+define void @test_store_0x0000555555555555(ptr %x) {
+; CHECK-LABEL: test_store_0x0000555555555555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    movk x8, #0, lsl #48
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x0000555555555555, ptr %x
+  ret void
+}
+
+define void @test_store_0x0000555500005555(ptr %x) {
+; CHECK-LABEL: test_store_0x0000555500005555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #21845 // =0x5555
+; CHECK-NEXT:    movk x8, #21845, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x0000555500005555, ptr %x
+  ret void
+}
+
+define void @test_store_0x5555000055550000(ptr %x) {
+; CHECK-LABEL: test_store_0x5555000055550000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #1431633920 // =0x55550000
+; CHECK-NEXT:    movk x8, #21845, lsl #48
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x5555000055550000, ptr %x
+  ret void
+}
+
+define void @test_store_u0xffff5555ffff5555(ptr %x) {
+; CHECK-LABEL: test_store_u0xffff5555ffff5555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-43691 // =0xffffffffffff5555
+; CHECK-NEXT:    movk x8, #21845, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0xffff5555ffff5555, ptr %x
+  ret void
+}
+
+define void @test_store_0x8888ffff8888ffff(ptr %x) {
+; CHECK-LABEL: test_store_0x8888ffff8888ffff:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-2004287489 // =0xffffffff8888ffff
+; CHECK-NEXT:    movk x8, #34952, lsl #48
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0x8888ffff8888ffff, ptr %x
+  ret void
+}
+
+define void @test_store_uu0xfffff555f555f555(ptr %x) {
+; CHECK-LABEL: test_store_uu0xfffff555f555f555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #-2731 // =0xfffffffffffff555
+; CHECK-NEXT:    movk x8, #62805, lsl #16
+; CHECK-NEXT:    movk x8, #62805, lsl #32
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0xfffff555f555f555, ptr %x
+  ret void
+}
+
+define void @test_store_uu0xf555f555f555f555(ptr %x) {
+; CHECK-LABEL: test_store_uu0xf555f555f555f555:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #6148914691236517205 // =0x5555555555555555
+; CHECK-NEXT:    orr x8, x8, #0xe001e001e001e001
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+  store i64 u0xf555f555f555f555, ptr %x
+  ret void
+}
+
+define void @test_store_0x1234567812345678_offset_range(ptr %x) {
+  %g = getelementptr i64, ptr %x, i64 4
+  store i64 u0x1234567812345678, ptr %g
+  ret void
+}
+
+define void @test_store_0x1234567812345678_offset_min(ptr %x) {
+  %g = getelementptr i1, ptr %x, i32 0
+  store i64 u0x1234567812345678, ptr %g
+  ret void
+}
+
+define void @test_store_0x1234567812345678_offset_max(ptr %x) {
+  %g = getelementptr i1, ptr %x, i32 248
+  store i64 u0x1234567812345678, ptr %g
+  ret void
+}
+
+define void @test_store_0x1234567812345678_offset_max_over(ptr %x) {
+  %g = getelementptr i1, ptr %x, i32 249
+  store i64 u0x1234567812345678, ptr %g
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 72529807d5d54..30d8a50a20974 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -32,3 +32,105 @@ body:             |
     ; CHECK-NEXT: RET undef $lr, implicit $x0
     renamable $x0 = MOVi64imm -4550323095879417536
     RET_ReallyLR implicit $x0
+...
+---
+name: test_fold_repeating_constant_store
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
+    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: RET undef $lr
+    renamable $x8 = MOVi64imm 90284035103834330
+    STRXui killed renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR
+...
+---
+name: test_fold_repeating_constant_store_neg
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_neg
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
+    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: RET undef $lr
+    renamable $x8 = MOVi64imm -4550323095879417536
+    STRXui killed renamable $x8, killed renamable $x0, 0
+    RET_ReallyLR
+...
+---
+name: test_fold_repeating_constant_store_16bit_unit
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 21845, 48
+    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: RET undef $lr
+    renamable $x8 = MOVZXi 21845, 16
+    renamable $x8 = MOVKXi $x8, 21845, 48
+    STRXui killed renamable $x8, killed renamable $x0, 0
+    RET undef $lr
+...
+---
+name: test_fold_repeating_constant_store_offset_min
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    renamable $x8 = MOVZXi 22136, 0
+    renamable $x8 = MOVKXi $x8, 4660, 16
+    renamable $x8 = ORRXrs $x8, $x8, 32
+    STRXui killed renamable $x8, killed renamable $x0, -32
+    RET undef $lr
+...
+---
+name: test_fold_repeating_constant_store_offset_max
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    renamable $x8 = MOVZXi 22136, 0
+    renamable $x8 = MOVKXi $x8, 4660, 16
+    renamable $x8 = ORRXrs $x8, $x8, 32
+    STRXui killed renamable $x8, killed renamable $x0, 31
+    RET undef $lr
+...
+---
+name: test_fold_repeating_constant_store_offset_min_lower
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    renamable $x8 = MOVZXi 22136, 0
+    renamable $x8 = MOVKXi $x8, 4660, 16
+    renamable $x8 = ORRXrs $x8, $x8, 32
+    STRXui killed renamable $x8, killed renamable $x0, -33
+    RET undef $lr
+...
+---   
+name: test_fold_repeating_constant_store_offset_max_over
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0
+    renamable $x8 = MOVZXi 22136, 0
+    renamable $x8 = MOVKXi $x8, 4660, 16
+    renamable $x8 = ORRXrs $x8, $x8, 32
+    STRXui killed renamable $x8, killed renamable $x0, 32
+    RET undef $lr

>From bc663318ee138a88c895408f4ee39ad396f3009e Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Fri, 22 Mar 2024 14:31:19 +0900
Subject: [PATCH 2/7] [AArch64] Optimize when storing symmetry constants

This change looks for stores of constants that are symmetric in
32-bit units, usually materialized by several 'MOV' instructions and
at most one 'ORR'.

If such a sequence is found, only the lower 32-bit half of the
constant is materialized, and the 64-bit store is replaced with an
'STP' instruction that stores that 32-bit value twice.

For example:
  renamable $x8 = MOVZXi 49370, 0
  renamable $x8 = MOVKXi $x8, 320, 16
  renamable $x8 = ORRXrs $x8, $x8, 32
  STRXui killed renamable $x8, killed renamable $x0, 0
becomes
  $w8 = MOVZWi 49370, 0
  $w8 = MOVKWi $w8, 320, 16
  STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
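
A rough sketch of the symmetry test (not part of the patch; the helper
name hasIdenticalHalves is made up for illustration): the pass
essentially accumulates the immediate being materialized and checks
whether its upper and lower 32-bit halves match before rewriting the
store.

  // C++ sketch: a 64-bit constant can be stored as two identical
  // 32-bit words (stp w, w) iff its halves are equal.
  #include <cassert>
  #include <cstdint>

  static bool hasIdenticalHalves(uint64_t Imm) {
    return (Imm >> 32) == (Imm & 0xffffffffULL);
  }

  int main() {
    assert(hasIdenticalHalves(0x1234567812345678ULL));  // foldable
    assert(!hasIdenticalHalves(0x0000555555555555ULL)); // halves differ
  }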
---
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 189 ++++++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.ll     |  24 +--
 .../CodeGen/AArch64/movimm-expand-ldst.mir    |  19 +-
 3 files changed, 207 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index d0adb78b231a7..09d9fa11a4959 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -201,6 +201,14 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Finds and collapses loads of symmetric constant values.
+  bool tryFoldSymmetryConstantLoad(MachineBasicBlock::iterator &I,
+                                   unsigned Limit);
+  MachineBasicBlock::iterator
+  doFoldSymmetryConstantLoad(MachineInstr &MI,
+                             SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+                             int SuccIndex, bool hasORR, int Accumulated);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -2252,6 +2260,167 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+  auto MatchBaseReg = [&](unsigned Count) {
+    for (unsigned I = 0; I < Count; I++) {
+      auto OpI = MI.getOperand(I);
+      if (OpI.isReg() && OpI.getReg() != BaseReg)
+        return false;
+    }
+    return true;
+  };
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::MOVZXi:
+    return MatchBaseReg(1);
+  case AArch64::MOVKXi:
+    return MatchBaseReg(2);
+  case AArch64::ORRXrs:
+    MachineOperand &Imm = MI.getOperand(3);
+    // The fourth operand of ORR must be 32, which means this is a
+    // 32-bit symmetric constant load.
+    // e.g. renamable $x8 = ORRXrs $x8, $x8, 32
+    if (MatchBaseReg(3) && Imm.isImm() && Imm.getImm() == 32)
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
+    MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
+    int SuccIndex, bool hasORR, int Accumulated) {
+  MachineBasicBlock::iterator I = MI.getIterator();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator FirstMovI;
+  MachineBasicBlock *MBB = MI.getParent();
+  uint64_t Mask = 0xFFFFUL;
+  Register DstRegW;
+
+  if (hasORR) {
+    (*MIs.begin())->eraseFromParent();
+  } else {
+    int Index = 0;
+    for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
+      if (Index == SuccIndex - 1) {
+        FirstMovI = *MI;
+        break;
+      }
+      (*MI)->eraseFromParent();
+    }
+    DstRegW =
+        TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
+
+    int Lower = Accumulated & Mask;
+    if (Lower) {
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVZWi), DstRegW)
+          .addImm(Lower)
+          .addImm(0);
+      Lower = (Accumulated >> 16) & Mask;
+      if (Lower) {
+        BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+                TII->get(AArch64::MOVKWi), DstRegW)
+            .addUse(DstRegW)
+            .addImm(Lower)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+      }
+    } else {
+      Lower = Accumulated >> 16 & Mask;
+      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
+              TII->get(AArch64::MOVZWi), DstRegW)
+          .addImm(Lower)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
+    }
+    FirstMovI->eraseFromParent();
+  }
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
+  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  unsigned DstRegState = getRegState(MI.getOperand(0));
+  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
+      .addReg(DstRegW, DstRegState)
+      .addReg(DstRegW, DstRegState)
+      .addReg(MO.getReg(), getRegState(MO))
+      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .setMemRefs(MI.memoperands())
+      .setMIFlags(MI.getFlags());
+  I->eraseFromParent();
+
+  return NextI;
+}
+
+bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
+    MachineBasicBlock::iterator &I, unsigned Limit) {
+  MachineInstr &MI = *I;
+  if (MI.getOpcode() != AArch64::STRXui)
+    return false;
+
+  MachineBasicBlock::iterator MBBI = I;
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  if (MBBI == B)
+    return false;
+
+  Register BaseReg = getLdStRegOp(MI).getReg();
+  unsigned Count = 0, SuccIndex = 0;
+  bool hasORR = false;
+  SmallVector<MachineBasicBlock::iterator> MIs;
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+
+  uint64_t Accumulated = 0, Mask = 0xFFFFUL;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+    if (!MI.isTransient())
+      ++Count;
+    if (!isSymmetric(MI, BaseReg)) {
+      LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
+                                        TRI);
+      if (!ModifiedRegUnits.available(BaseReg) ||
+          !UsedRegUnits.available(BaseReg))
+        return false;
+      continue;
+    }
+
+    unsigned Opc = MI.getOpcode();
+    if (Opc == AArch64::ORRXrs) {
+      hasORR = true;
+      MIs.push_back(MBBI);
+      continue;
+    }
+    unsigned ValueOrder = Opc == AArch64::MOVZXi ? 1 : 2;
+    MachineOperand Value = MI.getOperand(ValueOrder);
+    MachineOperand Shift = MI.getOperand(ValueOrder + 1);
+    if (!Value.isImm() || !Shift.isImm())
+      return false;
+
+    uint64_t IValue = Value.getImm();
+    uint64_t IShift = Shift.getImm();
+    Accumulated -= (Accumulated & (Mask << IShift));
+    Accumulated += (IValue << IShift);
+    MIs.push_back(MBBI);
+    if (Accumulated != 0 &&
+        (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
+         (hasORR && Accumulated >> 32 == 0))) {
+      SuccIndex = MIs.size();
+      break;
+    }
+  } while (MBBI != B && Count < Limit);
+
+  if (SuccIndex) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, SuccIndex, hasORR, Accumulated);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2518,6 +2687,26 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
       ++MBBI;
   }
 
+  // We have an opportunity to optimize an `STRXui` whose source register holds
+  // the same 32-bit value in both halves. With `STPWi` the 32-bit value only
+  // needs to be materialized once and can then be stored twice.
+  // Considering :
+  // renamable $x8 = MOVZXi 49370, 0
+  // renamable $x8 = MOVKXi $x8, 320, 16
+  // renamable $x8 = ORRXrs $x8, $x8, 32
+  // STRXui killed renamable $x8, killed renamable $x0, 0
+  // Transform :
+  // $w8 = MOVZWi 49370, 0
+  // $w8 = MOVKWi $w8, 320, 16
+  // STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    if (tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index b4931043c2353..6729292b49d73 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -97,10 +97,9 @@ define i64 @testuu0xf555f555f555f555() {
 define void @test_store_0x1234567812345678(ptr %x) {
 ; CHECK-LABEL: test_store_0x1234567812345678:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #22136 // =0x5678
-; CHECK-NEXT:    movk x8, #4660, lsl #16
-; CHECK-NEXT:    orr x8, x8, x8, lsl #32
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov w8, #22136 // =0x5678
+; CHECK-NEXT:    movk w8, #4660, lsl #16
+; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x1234567812345678, ptr %x
   ret void
@@ -109,10 +108,9 @@ define void @test_store_0x1234567812345678(ptr %x) {
 define void @test_store_0xff3456ffff3456ff(ptr %x) {
 ; CHECK-LABEL: test_store_0xff3456ffff3456ff:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #22271 // =0x56ff
-; CHECK-NEXT:    movk x8, #65332, lsl #16
-; CHECK-NEXT:    orr x8, x8, x8, lsl #32
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov w8, #22271 // =0x56ff
+; CHECK-NEXT:    movk w8, #65332, lsl #16
+; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0xff3456ffff3456ff, ptr %x
   ret void
@@ -165,9 +163,8 @@ define void @test_store_0x0000555555555555(ptr %x) {
 define void @test_store_0x0000555500005555(ptr %x) {
 ; CHECK-LABEL: test_store_0x0000555500005555:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #21845 // =0x5555
-; CHECK-NEXT:    movk x8, #21845, lsl #32
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov w8, #21845 // =0x5555
+; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x0000555500005555, ptr %x
   ret void
@@ -176,9 +173,8 @@ define void @test_store_0x0000555500005555(ptr %x) {
 define void @test_store_0x5555000055550000(ptr %x) {
 ; CHECK-LABEL: test_store_0x5555000055550000:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #1431633920 // =0x55550000
-; CHECK-NEXT:    movk x8, #21845, lsl #48
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov w8, #1431633920 // =0x55550000
+; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x5555000055550000, ptr %x
   ret void
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index 30d8a50a20974..f217f7c0a7f83 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -42,10 +42,9 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
-    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 49370, 0
+    ; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm 90284035103834330
     STRXui killed renamable $x8, killed renamable $x0, 0
@@ -60,10 +59,9 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_neg
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
-    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 320, 0
+    ; CHECK-NEXT: $w8 = MOVKWi $w8, 49370, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm -4550323095879417536
     STRXui killed renamable $x8, killed renamable $x0, 0
@@ -78,9 +76,8 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
-    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 21845, 48
-    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 0
+    ; CHECK-NEXT: $w8 = MOVZWi 21845, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 21845, 16
     renamable $x8 = MOVKXi $x8, 21845, 48

>From ec880b1d43d3bfbe6a71db806ab6c00d3ba18227 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Wed, 12 Jun 2024 04:32:38 +0900
Subject: [PATCH 3/7] remove only unnecessary instructions instead of rebuilding the load sequence

---
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 74 ++++++-------------
 .../CodeGen/AArch64/movimm-expand-ldst.ll     | 12 +--
 .../CodeGen/AArch64/movimm-expand-ldst.mir    | 10 +--
 3 files changed, 35 insertions(+), 61 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 09d9fa11a4959..294d28b9b8b95 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -207,7 +207,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   MachineBasicBlock::iterator
   doFoldSymmetryConstantLoad(MachineInstr &MI,
                              SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
-                             int SuccIndex, bool hasORR, int Accumulated);
+                             int UpperLoadIdx, int Accumulated);
 
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
@@ -2260,7 +2260,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
-static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
+static bool isSymmetricLoadCandidate(MachineInstr &MI, Register BaseReg) {
   auto MatchBaseReg = [&](unsigned Count) {
     for (unsigned I = 0; I < Count; I++) {
       auto OpI = MI.getOperand(I);
@@ -2292,56 +2292,28 @@ static bool isSymmetric(MachineInstr &MI, Register BaseReg) {
 
 MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
     MachineInstr &MI, SmallVectorImpl<MachineBasicBlock::iterator> &MIs,
-    int SuccIndex, bool hasORR, int Accumulated) {
+    int UpperLoadIdx, int Accumulated) {
   MachineBasicBlock::iterator I = MI.getIterator();
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
-  MachineBasicBlock::iterator FirstMovI;
   MachineBasicBlock *MBB = MI.getParent();
-  uint64_t Mask = 0xFFFFUL;
-  Register DstRegW;
 
-  if (hasORR) {
+  if (!UpperLoadIdx) {
+    // ORR ensures that previous instructions load lower 32-bit constants.
+    // Remove ORR only.
     (*MIs.begin())->eraseFromParent();
   } else {
+    // We need to remove MOV for upper of 32bit because We know these instrs
+    // is part of symmetric constant.
     int Index = 0;
-    for (auto MI = MIs.begin(), E = MIs.end(); MI != E; ++MI, Index++) {
-      if (Index == SuccIndex - 1) {
-        FirstMovI = *MI;
-        break;
-      }
+    for (auto MI = MIs.begin(); Index < UpperLoadIdx; ++MI, Index++) {
       (*MI)->eraseFromParent();
     }
-    DstRegW =
-        TRI->getSubReg(FirstMovI->getOperand(0).getReg(), AArch64::sub_32);
-
-    int Lower = Accumulated & Mask;
-    if (Lower) {
-      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
-              TII->get(AArch64::MOVZWi), DstRegW)
-          .addImm(Lower)
-          .addImm(0);
-      Lower = (Accumulated >> 16) & Mask;
-      if (Lower) {
-        BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
-                TII->get(AArch64::MOVKWi), DstRegW)
-            .addUse(DstRegW)
-            .addImm(Lower)
-            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
-      }
-    } else {
-      Lower = Accumulated >> 16 & Mask;
-      BuildMI(*MBB, FirstMovI, FirstMovI->getDebugLoc(),
-              TII->get(AArch64::MOVZWi), DstRegW)
-          .addImm(Lower)
-          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16));
-    }
-    FirstMovI->eraseFromParent();
   }
 
   Register BaseReg = getLdStRegOp(MI).getReg();
   const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
-  DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
+  Register DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
   unsigned DstRegState = getRegState(MI.getOperand(0));
   BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
       .addReg(DstRegW, DstRegState)
@@ -2351,7 +2323,6 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
       .setMemRefs(MI.memoperands())
       .setMIFlags(MI.getFlags());
   I->eraseFromParent();
-
   return NextI;
 }
 
@@ -2367,19 +2338,18 @@ bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
     return false;
 
   Register BaseReg = getLdStRegOp(MI).getReg();
-  unsigned Count = 0, SuccIndex = 0;
-  bool hasORR = false;
+  unsigned Count = 0, UpperLoadIdx = 0;
+  uint64_t Accumulated = 0, Mask = 0xFFFFUL;
+  bool hasORR = false, Found = false;
   SmallVector<MachineBasicBlock::iterator> MIs;
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
-
-  uint64_t Accumulated = 0, Mask = 0xFFFFUL;
   do {
     MBBI = prev_nodbg(MBBI, B);
     MachineInstr &MI = *MBBI;
     if (!MI.isTransient())
       ++Count;
-    if (!isSymmetric(MI, BaseReg)) {
+    if (!isSymmetricLoadCandidate(MI, BaseReg)) {
       LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
                                         TRI);
       if (!ModifiedRegUnits.available(BaseReg) ||
@@ -2402,19 +2372,23 @@ bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
 
     uint64_t IValue = Value.getImm();
     uint64_t IShift = Shift.getImm();
-    Accumulated -= (Accumulated & (Mask << IShift));
-    Accumulated += (IValue << IShift);
+    uint64_t Adder = IValue << IShift;
     MIs.push_back(MBBI);
+    if (Adder >> 32)
+      UpperLoadIdx = MIs.size();
+
+    Accumulated -= Accumulated & (Mask << IShift);
+    Accumulated += Adder;
     if (Accumulated != 0 &&
         (((Accumulated >> 32) == (Accumulated & 0xffffffffULL)) ||
-         (hasORR && Accumulated >> 32 == 0))) {
-      SuccIndex = MIs.size();
+         (hasORR && (Accumulated >> 32 == 0)))) {
+      Found = true;
       break;
     }
   } while (MBBI != B && Count < Limit);
 
-  if (SuccIndex) {
-    I = doFoldSymmetryConstantLoad(MI, MIs, SuccIndex, hasORR, Accumulated);
+  if (Found) {
+    I = doFoldSymmetryConstantLoad(MI, MIs, UpperLoadIdx, Accumulated);
     return true;
   }
 
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index 6729292b49d73..946f8dfe54b4b 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -97,8 +97,8 @@ define i64 @testuu0xf555f555f555f555() {
 define void @test_store_0x1234567812345678(ptr %x) {
 ; CHECK-LABEL: test_store_0x1234567812345678:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #22136 // =0x5678
-; CHECK-NEXT:    movk w8, #4660, lsl #16
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
 ; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x1234567812345678, ptr %x
@@ -108,8 +108,8 @@ define void @test_store_0x1234567812345678(ptr %x) {
 define void @test_store_0xff3456ffff3456ff(ptr %x) {
 ; CHECK-LABEL: test_store_0xff3456ffff3456ff:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #22271 // =0x56ff
-; CHECK-NEXT:    movk w8, #65332, lsl #16
+; CHECK-NEXT:    mov x8, #22271 // =0x56ff
+; CHECK-NEXT:    movk x8, #65332, lsl #16
 ; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0xff3456ffff3456ff, ptr %x
@@ -163,7 +163,7 @@ define void @test_store_0x0000555555555555(ptr %x) {
 define void @test_store_0x0000555500005555(ptr %x) {
 ; CHECK-LABEL: test_store_0x0000555500005555:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #21845 // =0x5555
+; CHECK-NEXT:    mov x8, #21845 // =0x5555
 ; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x0000555500005555, ptr %x
@@ -173,7 +173,7 @@ define void @test_store_0x0000555500005555(ptr %x) {
 define void @test_store_0x5555000055550000(ptr %x) {
 ; CHECK-LABEL: test_store_0x5555000055550000:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1431633920 // =0x55550000
+; CHECK-NEXT:    mov x8, #1431633920 // =0x55550000
 ; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
   store i64 u0x5555000055550000, ptr %x
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index f217f7c0a7f83..ce1a05a6f1970 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -42,8 +42,8 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $w8 = MOVZWi 49370, 0
-    ; CHECK-NEXT: $w8 = MOVKWi $w8, 320, 16
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 49370, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 320, 16
     ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm 90284035103834330
@@ -59,8 +59,8 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_neg
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $w8 = MOVZWi 320, 0
-    ; CHECK-NEXT: $w8 = MOVKWi $w8, 49370, 16
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 320, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 49370, 16
     ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVi64imm -4550323095879417536
@@ -76,7 +76,7 @@ body:             |
     ; CHECK-LABEL: name: test_fold_repeating_constant_store_16bit_unit
     ; CHECK: liveins: $x0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: $w8 = MOVZWi 21845, 16
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 21845, 16
     ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
     ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 21845, 16

>From bd7c55bd01d47742dd8fda584d693a8854818ce5 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Tue, 18 Jun 2024 05:13:13 +0900
Subject: [PATCH 4/7] fix wrong Imm

---
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 19 +++++++++--
 .../CodeGen/AArch64/movimm-expand-ldst.ll     | 25 +++++++++++++++
 .../CodeGen/AArch64/movimm-expand-ldst.mir    | 32 ++++++++++++++++++-
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 294d28b9b8b95..2d87d29f4bd8c 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2315,11 +2315,12 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
   const MachineOperand MO = AArch64InstrInfo::getLdStBaseOp(MI);
   Register DstRegW = TRI->getSubReg(BaseReg, AArch64::sub_32);
   unsigned DstRegState = getRegState(MI.getOperand(0));
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
   BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STPWi))
       .addReg(DstRegW, DstRegState)
       .addReg(DstRegW, DstRegState)
       .addReg(MO.getReg(), getRegState(MO))
-      .add(AArch64InstrInfo::getLdStOffsetOp(MI))
+      .addImm(Offset * 2)
       .setMemRefs(MI.memoperands())
       .setMIFlags(MI.getFlags());
   I->eraseFromParent();
@@ -2337,6 +2338,19 @@ bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
   if (MBBI == B)
     return false;
 
+  TypeSize Scale(0U, false), Width(0U, false);
+  int64_t MinOffset, MaxOffset;
+  if (!AArch64InstrInfo::getMemOpInfo(AArch64::STPWi, Scale, Width, MinOffset,
+                                      MaxOffset))
+    return false;
+
+  // We replace the STRX instruction, which stores 64 bits, with the STPW
+  // instruction, which stores two consecutive 32 bits. therefore, we compare
+  // the offset range with multiplied by two.
+  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
+  if (Offset * 2 < MinOffset || Offset * 2 > MaxOffset)
+    return false;
+
   Register BaseReg = getLdStRegOp(MI).getReg();
   unsigned Count = 0, UpperLoadIdx = 0;
   uint64_t Accumulated = 0, Mask = 0xFFFFUL;
@@ -2675,7 +2689,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   // STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 0
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
-    if (tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
+    if (isMergeableLdStUpdate(*MBBI) &&
+        tryFoldSymmetryConstantLoad(MBBI, UpdateLimit))
       Modified = true;
     else
       ++MBBI;
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index 946f8dfe54b4b..5d88be2a0471e 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -226,24 +226,49 @@ define void @test_store_uu0xf555f555f555f555(ptr %x) {
 }
 
 define void @test_store_0x1234567812345678_offset_range(ptr %x) {
+; CHECK-LABEL: test_store_0x1234567812345678_offset_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
+; CHECK-NEXT:    stp w8, w8, [x0, #32]
+; CHECK-NEXT:    ret
   %g = getelementptr i64, ptr %x, i64 4
   store i64 u0x1234567812345678, ptr %g
   ret void
 }
 
 define void @test_store_0x1234567812345678_offset_min(ptr %x) {
+; CHECK-LABEL: test_store_0x1234567812345678_offset_min:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
+; CHECK-NEXT:    stp w8, w8, [x0]
+; CHECK-NEXT:    ret
   %g = getelementptr i1, ptr %x, i32 0
   store i64 u0x1234567812345678, ptr %g
   ret void
 }
 
 define void @test_store_0x1234567812345678_offset_max(ptr %x) {
+; CHECK-LABEL: test_store_0x1234567812345678_offset_max:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
+; CHECK-NEXT:    stp w8, w8, [x0, #248]
+; CHECK-NEXT:    ret
   %g = getelementptr i1, ptr %x, i32 248
   store i64 u0x1234567812345678, ptr %g
   ret void
 }
 
 define void @test_store_0x1234567812345678_offset_max_over(ptr %x) {
+; CHECK-LABEL: test_store_0x1234567812345678_offset_max_over:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #22136 // =0x5678
+; CHECK-NEXT:    movk x8, #4660, lsl #16
+; CHECK-NEXT:    orr x8, x8, x8, lsl #32
+; CHECK-NEXT:    stur x8, [x0, #249]
+; CHECK-NEXT:    ret
   %g = getelementptr i1, ptr %x, i32 249
   store i64 u0x1234567812345678, ptr %g
   ret void
diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
index ce1a05a6f1970..fa639cb0777d6 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.mir
@@ -90,6 +90,13 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_min
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, -64
+    ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 22136, 0
     renamable $x8 = MOVKXi $x8, 4660, 16
     renamable $x8 = ORRXrs $x8, $x8, 32
@@ -102,6 +109,13 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_max
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
+    ; CHECK-NEXT: STPWi killed renamable $w8, killed renamable $w8, killed renamable $x0, 62
+    ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 22136, 0
     renamable $x8 = MOVKXi $x8, 4660, 16
     renamable $x8 = ORRXrs $x8, $x8, 32
@@ -114,18 +128,34 @@ tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_min_lower
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
+    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, -33
+    ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 22136, 0
     renamable $x8 = MOVKXi $x8, 4660, 16
     renamable $x8 = ORRXrs $x8, $x8, 32
     STRXui killed renamable $x8, killed renamable $x0, -33
     RET undef $lr
 ...
----   
+---
 name: test_fold_repeating_constant_store_offset_max_over
 tracksRegLiveness: true
 body:             |
   bb.0:
     liveins: $x0
+    ; CHECK-LABEL: name: test_fold_repeating_constant_store_offset_max_over
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: renamable $x8 = MOVZXi 22136, 0
+    ; CHECK-NEXT: renamable $x8 = MOVKXi $x8, 4660, 16
+    ; CHECK-NEXT: renamable $x8 = ORRXrs $x8, $x8, 32
+    ; CHECK-NEXT: STRXui killed renamable $x8, killed renamable $x0, 32
+    ; CHECK-NEXT: RET undef $lr
     renamable $x8 = MOVZXi 22136, 0
     renamable $x8 = MOVKXi $x8, 4660, 16
     renamable $x8 = ORRXrs $x8, $x8, 32

>From ff31bcefc979061f0d77030abef79b7d62af51ed Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Wed, 26 Jun 2024 07:29:24 +0900
Subject: [PATCH 5/7] fix typo

---
 llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 2d87d29f4bd8c..da94dbd6c31bd 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2303,7 +2303,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::doFoldSymmetryConstantLoad(
     // Remove ORR only.
     (*MIs.begin())->eraseFromParent();
   } else {
-    // We need to remove MOV for upper of 32bit because We know these instrs
+    // We need to remove MOV for upper of 32bit because we know these instrs
     // is part of symmetric constant.
     int Index = 0;
     for (auto MI = MIs.begin(); Index < UpperLoadIdx; ++MI, Index++) {
@@ -2345,7 +2345,7 @@ bool AArch64LoadStoreOpt::tryFoldSymmetryConstantLoad(
     return false;
 
   // We replace the STRX instruction, which stores 64 bits, with the STPW
-  // instruction, which stores two consecutive 32 bits. therefore, we compare
+  // instruction, which stores two consecutive 32 bits. Therefore, we compare
   // the offset range with multiplied by two.
   int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
   if (Offset * 2 < MinOffset || Offset * 2 > MaxOffset)

>From 5d67b1c6e8785111d8563b2b6328e734760d1e25 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Wed, 26 Jun 2024 07:33:23 +0900
Subject: [PATCH 6/7] fix pointer with 0 offset to be of type i8

---
 llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index 5d88be2a0471e..99f62fd13c003 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -244,7 +244,7 @@ define void @test_store_0x1234567812345678_offset_min(ptr %x) {
 ; CHECK-NEXT:    movk x8, #4660, lsl #16
 ; CHECK-NEXT:    stp w8, w8, [x0]
 ; CHECK-NEXT:    ret
-  %g = getelementptr i1, ptr %x, i32 0
+  %g = getelementptr i8, ptr %x, i32 0
   store i64 u0x1234567812345678, ptr %g
   ret void
 }

>From 42a804835711a9440ee670c0433a466e230ca418 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Wed, 26 Jun 2024 19:17:31 +0900
Subject: [PATCH 7/7] fix incorrectly used getelementptr type i1 to i8

---
 llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
index 99f62fd13c003..9f4ebf5efb982 100644
--- a/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/movimm-expand-ldst.ll
@@ -256,7 +256,7 @@ define void @test_store_0x1234567812345678_offset_max(ptr %x) {
 ; CHECK-NEXT:    movk x8, #4660, lsl #16
 ; CHECK-NEXT:    stp w8, w8, [x0, #248]
 ; CHECK-NEXT:    ret
-  %g = getelementptr i1, ptr %x, i32 248
+  %g = getelementptr i8, ptr %x, i32 248
   store i64 u0x1234567812345678, ptr %g
   ret void
 }
@@ -269,7 +269,7 @@ define void @test_store_0x1234567812345678_offset_max_over(ptr %x) {
 ; CHECK-NEXT:    orr x8, x8, x8, lsl #32
 ; CHECK-NEXT:    stur x8, [x0, #249]
 ; CHECK-NEXT:    ret
-  %g = getelementptr i1, ptr %x, i32 249
+  %g = getelementptr i8, ptr %x, i32 249
   store i64 u0x1234567812345678, ptr %g
   ret void
 }


