[llvm] [AArch64] Prevent the AArch64LoadStoreOptimizer from reordering CFI instructions (PR #101317)

Momchil Velikov via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 31 03:20:15 PDT 2024


https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/101317

When the AArch64LoadStoreOptimizer pass merges an SP update with a load/store
instruction and needs to adjust unwind information, it either:
* creates the merged instruction at the location of the SP update (so no CFI
  instructions are moved), or
* moves a CFI instruction only if the move would not reorder it across other
  CFI instructions.

If neither of the above is possible, the optimisation is not performed.
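
For illustration, a simplified sketch of the backward-scan case (registers,
offsets and the intervening code are made up for the example):

    sub sp, sp, #16          // SP update
    .cfi_def_cfa_offset 16
    ...                      // intervening instructions
    str x0, [sp]             // store at the new SP

The merged pre-index store `str x0, [sp, #-16]!` is created at the location
of the `sub` when the intervening instructions permit it, so the CFI does not
move. Otherwise it is created at the location of the original `str` and the
CFI is moved after it, which is done only if the move does not carry the CFI
across another CFI instruction.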

From 84ef8f8da899eb4bfbc3151e17932ebd39c33a71 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 30 Jul 2024 13:34:19 +0100
Subject: [PATCH 1/2] [AArch64] Fix incorrectly getting the destination reg of
 an insn
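
maybeMoveCFI() and mergeUpdateInsn() applied the load/store operand accessors
to the SP update instruction (an ADDXri/SUBXri), which is not a load or store;
for such an instruction the destination register is simply operand 0.
Illustrative MIR (registers and immediates are made up):

    $sp = ADDXri $sp, 16, 0

Here operand 0 is the destination, operand 1 the source, and operands 2 and 3
the immediate and its shift. The new asserts in getLdStBaseOp() and
getLdStOffsetOp() reject this kind of misuse.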

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp          | 2 ++
 llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6cd9a1a817086..6902b783d7d65 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4112,6 +4112,7 @@ bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
 }
 
 const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
+  assert(MI.mayLoadOrStore() && "Load or store instruction expected");
   unsigned Idx =
       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
                                                                             : 1;
@@ -4120,6 +4121,7 @@ const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
 
 const MachineOperand &
 AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
+  assert(MI.mayLoadOrStore() && "Load or store instruction expected");
   unsigned Idx =
       AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
                                                                             : 2;
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index d0adb78b231a7..6deea4f162db6 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1956,7 +1956,7 @@ maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
       MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
       !(MI.getFlag(MachineInstr::FrameSetup) ||
         MI.getFlag(MachineInstr::FrameDestroy)) ||
-      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
+      MI.getOperand(0).getReg() != AArch64::SP)
     return End;
 
   const MachineFunction &MF = *MI.getParent()->getParent();
@@ -2006,7 +2006,7 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
-              .add(getLdStRegOp(*Update))
+              .add(Update->getOperand(0))
               .add(getLdStRegOp(*I))
               .add(AArch64InstrInfo::getLdStBaseOp(*I))
               .addImm(Value / Scale)
@@ -2015,7 +2015,7 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   } else {
     // Paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
-              .add(getLdStRegOp(*Update))
+              .add(Update->getOperand(0))
               .add(getLdStRegOp(*I, 0))
               .add(getLdStRegOp(*I, 1))
               .add(AArch64InstrInfo::getLdStBaseOp(*I))

From 011b5413c946050fd3db993ee464f60acdb033ed Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 30 Jul 2024 17:33:58 +0100
Subject: [PATCH 2/2] [AArch64] Prevent the AArch64LoadStoreOptimizer from
 reordering CFI instructions

When the AArch64LoadStoreOptimizer pass merges an SP update with a load/store
instruction, it either:
* creates the merged instruction at the location of the SP update (so no CFI
  instructions are moved), or
* moves a CFI instruction only if the move would not reorder it across other
  CFI instructions.

If neither of the above is possible, the optimisation is not performed.
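
The added test llvm/test/CodeGen/AArch64/no-reorder-cfi.ll exercises the
problematic pattern. Schematically (the exact pre-pass sequence is
illustrative):

    stp x29, x30, [sp, #-16]!
    sub sp, sp, #16
    .cfi_def_cfa_offset 32
    .cfi_offset w30, -8
    .cfi_offset w29, -16
    str xzr, [sp]

Creating the merged `str xzr, [sp, #-16]!` at the location of the original
`str` would move `.cfi_def_cfa_offset 32` across the two `.cfi_offset`
directives. Since only CFI instructions lie between the `sub` and the `str`,
the merged store can instead be created at the location of the `sub`, leaving
the CFI order intact, as the CHECK lines in the test show.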
---
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 109 +++++++++++++-----
 llvm/test/CodeGen/AArch64/build-one-lane.ll   |   2 +-
 llvm/test/CodeGen/AArch64/insertextract.ll    |  28 ++---
 llvm/test/CodeGen/AArch64/no-reorder-cfi.ll   |  26 +++++
 ...treaming-mode-fixed-length-bit-counting.ll |  18 +--
 ...ing-mode-fixed-length-insert-vector-elt.ll |   6 +-
 llvm/test/CodeGen/AArch64/vector-compress.ll  |  20 ++--
 7 files changed, 142 insertions(+), 67 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/no-reorder-cfi.ll

diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 6deea4f162db6..e6ecfaab49a3d 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -176,8 +176,12 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
+  // `MergeEither` is set to true if the combined instruction may be placed
+  // either at the location of the load/store instruction or at the location of
+  // the update instruction.
   MachineBasicBlock::iterator
-  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
+  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                 bool &MergeEither);
 
   // Find an instruction that updates the base register of the ld/st
   // instruction.
@@ -185,9 +189,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
                             unsigned BaseReg, int Offset);
 
   // Merge a pre- or post-index base register update into a ld/st instruction.
-  MachineBasicBlock::iterator
+  std::optional<MachineBasicBlock::iterator>
   mergeUpdateInsn(MachineBasicBlock::iterator I,
-                  MachineBasicBlock::iterator Update, bool IsPreIdx);
+                  MachineBasicBlock::iterator Update, bool IsForward,
+                  bool IsPreIdx, bool MergeEither);
 
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -1971,20 +1976,37 @@ maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
   }
 }
 
-MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
-                                     MachineBasicBlock::iterator Update,
-                                     bool IsPreIdx) {
+std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
+    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
+    bool IsForward, bool IsPreIdx, bool MergeEither) {
   assert((Update->getOpcode() == AArch64::ADDXri ||
           Update->getOpcode() == AArch64::SUBXri) &&
          "Unexpected base register update instruction to merge!");
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator NextI = next_nodbg(I, E);
 
-  // If updating the SP and the following instruction is CFA offset related CFI
-  // instruction move it after the merged instruction.
-  MachineBasicBlock::iterator CFI =
-      IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
+  // If updating the SP and the following instruction is a CFA-offset-related CFI,
+  // make sure the CFI follows the SP update either by merging at the location
+  // of the update or by moving the CFI after the merged instruction. If unable
+  // to do so, bail.
+  MachineBasicBlock::iterator InsertPt = I;
+  if (IsForward) {
+    assert(IsPreIdx);
+    if (auto CFI = maybeMoveCFI(*Update, next_nodbg(Update, E)); CFI != E) {
+      if (MergeEither) {
+        InsertPt = Update;
+      } else {
+        // Take care not to reorder CFIs.
+        if (std::any_of(std::next(CFI), I, [](const auto &Insn) {
+              return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
+            }))
+          return std::nullopt;
+
+        MachineBasicBlock *MBB = InsertPt->getParent();
+        MBB->splice(std::next(InsertPt), MBB, CFI);
+      }
+    }
+  }
 
   // Return the instruction following the merged instruction, which is
   // the instruction following our unmerged load. Unless that's the add/sub
@@ -2005,7 +2027,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
   if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
-    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+    MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
+                  TII->get(NewOpc))
               .add(Update->getOperand(0))
               .add(getLdStRegOp(*I))
               .add(AArch64InstrInfo::getLdStBaseOp(*I))
@@ -2014,7 +2037,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
               .setMIFlags(I->mergeFlagsWith(*Update));
   } else {
     // Paired instruction.
-    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+    MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
+                  TII->get(NewOpc))
               .add(Update->getOperand(0))
               .add(getLdStRegOp(*I, 0))
               .add(getLdStRegOp(*I, 1))
@@ -2023,10 +2047,6 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
               .setMemRefs(I->memoperands())
               .setMIFlags(I->mergeFlagsWith(*Update));
   }
-  if (CFI != E) {
-    MachineBasicBlock *MBB = I->getParent();
-    MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
-  }
 
   if (IsPreIdx) {
     ++NumPreFolded;
@@ -2174,7 +2194,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
 }
 
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
-    MachineBasicBlock::iterator I, unsigned Limit) {
+    MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
   MachineBasicBlock::iterator B = I->getParent()->begin();
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineInstr &MemMI = *I;
@@ -2184,6 +2204,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
   int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
 
+  bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
+  Register DestReg[] = {getLdStRegOp(MemMI, 0).getReg(),
+                        IsPairedInsn ? getLdStRegOp(MemMI, 1).getReg()
+                                     : AArch64::NoRegister};
+
   // If the load/store is the first instruction in the block, there's obviously
   // not any matching update. Ditto if the memory offset isn't zero.
   if (MBBI == B || Offset != 0)
@@ -2191,12 +2216,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   // If the base register overlaps a destination register, we can't
   // merge the update.
   if (!isTagStore(MemMI)) {
-    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
-    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
-      Register DestReg = getLdStRegOp(MemMI, i).getReg();
-      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
+    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
+      if (DestReg[i] == BaseReg || TRI->isSubRegister(BaseReg, DestReg[i]))
         return E;
-    }
   }
 
   const bool BaseRegSP = BaseReg == AArch64::SP;
@@ -2217,6 +2239,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   UsedRegUnits.clear();
   unsigned Count = 0;
   bool MemAcessBeforeSPPreInc = false;
+  MergeEither = true;
   do {
     MBBI = prev_nodbg(MBBI, B);
     MachineInstr &MI = *MBBI;
@@ -2243,6 +2266,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
     if (!ModifiedRegUnits.available(BaseReg) ||
         !UsedRegUnits.available(BaseReg))
       return E;
+
+    // If we have a destination register (i.e. a load instruction) and that
+    // register is used or modified, then we can only merge forward, i.e. the
+    // combined instruction is put in the place of the memory instruction. The
+    // same applies if we see a memory access or side effects.
+    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
+        (DestReg[0] != AArch64::NoRegister &&
+         !(ModifiedRegUnits.available(DestReg[0]) &&
+           UsedRegUnits.available(DestReg[0]))) ||
+        (DestReg[1] != AArch64::NoRegister &&
+         !(ModifiedRegUnits.available(DestReg[1]) &&
+           UsedRegUnits.available(DestReg[1]))))
+      MergeEither = false;
+
     // Keep track if we have a memory access before an SP pre-increment, in this
     // case we need to validate later that the update amount respects the red
     // zone.
@@ -2399,8 +2436,12 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
   if (Update != E) {
     // Merge the update into the ld/st.
-    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
-    return true;
+    if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
+                                     /*IsPreIdx=*/false,
+                                     /*MergeEither=*/false)) {
+      MBBI = *NextI;
+      return true;
+    }
   }
 
   // Don't know how to handle unscaled pre/post-index versions below, so bail.
@@ -2412,11 +2453,15 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   // ldr x1, [x0]
   //   merged into:
   // ldr x1, [x0, #8]!
-  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
+  bool MergeEither;
+  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit, MergeEither);
   if (Update != E) {
     // Merge the update into the ld/st.
-    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
-    return true;
+    if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/true,
+                                     /*IsPreIdx=*/true, MergeEither)) {
+      MBBI = *NextI;
+      return true;
+    }
   }
 
   // The immediate in the load/store is scaled by the size of the memory
@@ -2433,8 +2478,12 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
   if (Update != E) {
     // Merge the update into the ld/st.
-    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
-    return true;
+    if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
+                                     /*IsPreIdx=*/true,
+                                     /*MergeEither=*/false)) {
+      MBBI = *NextI;
+      return true;
+    }
   }
 
   return false;
diff --git a/llvm/test/CodeGen/AArch64/build-one-lane.ll b/llvm/test/CodeGen/AArch64/build-one-lane.ll
index a517ca4a1bb4b..ac37fbc349d7d 100644
--- a/llvm/test/CodeGen/AArch64/build-one-lane.ll
+++ b/llvm/test/CodeGen/AArch64/build-one-lane.ll
@@ -318,9 +318,9 @@ define void @v2f64st(ptr %p, double %s) nounwind {
 define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
 ; CHECK-LABEL: test_lanex_32xi8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT:    and x8, x0, #0x1f
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    mov w10, #30 // =0x1e
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 8b82004388b09..d1258d127d1a9 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -160,9 +160,9 @@ entry:
 define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v4f64_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x3
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    str d2, [x9, x8, lsl #3]
@@ -396,9 +396,9 @@ entry:
 define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v8f32_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x7
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    str s2, [x9, x8, lsl #2]
@@ -561,9 +561,9 @@ entry:
 define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v16f16_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0xf
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    str h2, [x9, x8, lsl #1]
@@ -724,9 +724,9 @@ entry:
 define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v32i8_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    and x8, x1, #0x1f
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    strb w0, [x9, x8]
@@ -885,9 +885,9 @@ entry:
 define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v16i16_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    and x8, x1, #0xf
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    strh w0, [x9, x8, lsl #1]
@@ -1114,9 +1114,9 @@ entry:
 define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v8i32_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    and x8, x1, #0x7
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    str w0, [x9, x8, lsl #2]
@@ -1299,9 +1299,9 @@ entry:
 define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) {
 ; CHECK-SD-LABEL: insert_v4i64_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1
 ; CHECK-SD-NEXT:    and x8, x1, #0x3
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    str x0, [x9, x8, lsl #3]
@@ -1465,9 +1465,9 @@ entry:
 define double @extract_v4f64_c(<4 x double> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v4f64_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x3
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldr d0, [x9, x8, lsl #3]
@@ -1673,9 +1673,9 @@ entry:
 define float @extract_v8f32_c(<8 x float> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v8f32_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x7
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldr s0, [x9, x8, lsl #2]
@@ -1832,9 +1832,9 @@ entry:
 define half @extract_v16f16_c(<16 x half> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v16f16_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0xf
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldr h0, [x9, x8, lsl #1]
@@ -1990,9 +1990,9 @@ entry:
 define i8 @extract_v32i8_c(<32 x i8> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v32i8_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x1f
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldrb w0, [x9, x8]
@@ -2146,9 +2146,9 @@ entry:
 define i16 @extract_v16i16_c(<16 x i16> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v16i16_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0xf
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldrh w0, [x9, x8, lsl #1]
@@ -2379,9 +2379,9 @@ entry:
 define i32 @extract_v8i32_c(<8 x i32> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v8i32_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x7
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldr w0, [x9, x8, lsl #2]
@@ -2562,9 +2562,9 @@ entry:
 define i64 @extract_v4i64_c(<4 x i64> %a, i32 %c) {
 ; CHECK-SD-LABEL: extract_v4i64_c:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    stp q0, q1, [sp, #-32]!
 ; CHECK-SD-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
 ; CHECK-SD-NEXT:    and x8, x0, #0x3
 ; CHECK-SD-NEXT:    mov x9, sp
 ; CHECK-SD-NEXT:    ldr x0, [x9, x8, lsl #3]
diff --git a/llvm/test/CodeGen/AArch64/no-reorder-cfi.ll b/llvm/test/CodeGen/AArch64/no-reorder-cfi.ll
new file mode 100644
index 0000000000000..cc7acf6ddfb5e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/no-reorder-cfi.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -homogeneous-prolog-epilog < %s | FileCheck %s
+target triple = "aarch64-linux"
+
+declare void @g(ptr, ptr)
+
+define void @f() minsize {
+; CHECK-LABEL: f:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]!
+; CHECK-NEXT:    str xzr, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x0, sp, #8
+; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    bl g
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    b OUTLINED_FUNCTION_EPILOG_TAIL_x30x29
+entry:
+  %p = alloca i32, align 8
+  %n = alloca i64, align 8
+  store i64 0, ptr %n, align 8
+  call void @g(ptr %p, ptr %n)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index 2a83f13251d76..bd49db8a4c414 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -741,11 +741,11 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #6]
 ; NONEON-NOSVE-NEXT:    ldrb w11, [sp, #4]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    ldrb w10, [sp, #2]
 ; NONEON-NOSVE-NEXT:    ldrb w12, [sp]
 ; NONEON-NOSVE-NEXT:    lsr w13, w9, #1
@@ -814,10 +814,10 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #7]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -939,10 +939,10 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v16i8:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldrb w9, [sp, #15]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -1611,11 +1611,11 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #4]
 ; NONEON-NOSVE-NEXT:    ldrh w10, [sp]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w11, w9, #1
 ; NONEON-NOSVE-NEXT:    lsr w12, w10, #1
 ; NONEON-NOSVE-NEXT:    and w11, w11, #0x55555555
@@ -1657,10 +1657,10 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #6]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -1730,10 +1730,10 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v8i16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldrh w9, [sp, #14]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -2089,10 +2089,10 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #4]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -2135,10 +2135,10 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v4i32:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr w9, [sp, #12]
+; NONEON-NOSVE-NEXT:    mov w8, #16843009 // =0x1010101
 ; NONEON-NOSVE-NEXT:    lsr w10, w9, #1
 ; NONEON-NOSVE-NEXT:    and w10, w10, #0x55555555
 ; NONEON-NOSVE-NEXT:    sub w9, w9, w10
@@ -2366,10 +2366,10 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) {
 ;
 ; NONEON-NOSVE-LABEL: ctpop_v2i64:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; NONEON-NOSVE-NEXT:    str q0, [sp, #-32]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 32
 ; NONEON-NOSVE-NEXT:    ldr x9, [sp, #8]
+; NONEON-NOSVE-NEXT:    mov x8, #72340172838076673 // =0x101010101010101
 ; NONEON-NOSVE-NEXT:    lsr x10, x9, #1
 ; NONEON-NOSVE-NEXT:    and x10, x10, #0x5555555555555555
 ; NONEON-NOSVE-NEXT:    sub x9, x9, x10
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 3ba61c3335a64..d77473ed8f08e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -518,13 +518,13 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) {
 ;
 ; NONEON-NOSVE-LABEL: insertelement_v2f16:
 ; NONEON-NOSVE:       // %bb.0:
-; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
 ; NONEON-NOSVE-NEXT:    str d0, [sp, #-16]!
 ; NONEON-NOSVE-NEXT:    .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT:    ldr h0, [x8, :lo12:.LCPI14_0]
+; NONEON-NOSVE-NEXT:    adrp x8, .LCPI14_0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp]
-; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
+; NONEON-NOSVE-NEXT:    ldr h0, [x8, :lo12:.LCPI14_0]
 ; NONEON-NOSVE-NEXT:    str h1, [sp, #8]
+; NONEON-NOSVE-NEXT:    str h0, [sp, #10]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp, #8]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #16
 ; NONEON-NOSVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index fcf5c546f2610..710ea70d678c5 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -36,37 +36,37 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) {
 define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) {
 ; CHECK-LABEL: test_compress_v4i32_with_passthru:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str q2, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ushll.4s v1, v1, #0
 ; CHECK-NEXT:    movi.4s v3, #1
+; CHECK-NEXT:    mov x12, sp
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    mov w15, #3 ; =0x3
 ; CHECK-NEXT:    shl.4s v1, v1, #31
 ; CHECK-NEXT:    cmlt.4s v1, v1, #0
 ; CHECK-NEXT:    and.16b v3, v1, v3
-; CHECK-NEXT:    str q2, [sp, #-16]!
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    mov.s w8, v1[1]
 ; CHECK-NEXT:    fmov w16, s1
-; CHECK-NEXT:    mov x12, sp
 ; CHECK-NEXT:    mov.s w11, v1[2]
-; CHECK-NEXT:    addv.4s s2, v3
-; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    mov.s w13, v1[3]
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mov x14, sp
+; CHECK-NEXT:    addv.4s s2, v3
 ; CHECK-NEXT:    bfi x12, x16, #2, #1
 ; CHECK-NEXT:    and x16, x16, #0x1
-; CHECK-NEXT:    mov w15, #3 ; =0x3
 ; CHECK-NEXT:    and x8, x8, #0x1
 ; CHECK-NEXT:    add x8, x16, x8
-; CHECK-NEXT:    fmov w16, s2
 ; CHECK-NEXT:    and x11, x11, #0x1
 ; CHECK-NEXT:    and x13, x13, #0x1
+; CHECK-NEXT:    fmov w16, s2
 ; CHECK-NEXT:    add x11, x8, x11
 ; CHECK-NEXT:    orr x8, x9, x8, lsl #2
 ; CHECK-NEXT:    add x13, x11, x13
 ; CHECK-NEXT:    bfi x14, x11, #2, #2
+; CHECK-NEXT:    cmp x13, #3
 ; CHECK-NEXT:    bfi x10, x16, #2, #2
 ; CHECK-NEXT:    mov.s w16, v0[3]
-; CHECK-NEXT:    cmp x13, #3
 ; CHECK-NEXT:    csel x11, x13, x15, lo
 ; CHECK-NEXT:    ldr w10, [x10]
 ; CHECK-NEXT:    str s0, [sp]


