[llvm] [CodeGen] Prevent register coalescer rematerialization for zero cycle regmoves (PR #147571)

Tomer Shafir via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 9 06:09:06 PDT 2025


https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/147571

>From f98317522c617b08a0fb14bcd792f30b811ea248 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 8 Jul 2025 21:04:30 +0300
Subject: [PATCH 1/4] [AArch64] Add FeatureZCRegMoveFPR128 subtarget feature

Adds a subtarget feature called `FeatureZCRegMoveFPR128` that makes it
possible to query whether the target supports zero-cycle register moves
for FPR128 NEON registers, and enables it on the appropriate processors.
It prepares for a register coalescer optimization that prevents
rematerialization of moves where the target supports ZCM.
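
For context, the feature can be toggled explicitly on the llc command line
like any other subtarget feature; a usage sketch (the flags match the RUN
lines of the tests later in this series, `input.mir` is a placeholder):

  # Force the feature on a target that does not enable it by default:
  llc -mtriple=arm64-apple-macosx -mattr=+zcm-fpr128 input.mir

  # Strip it from a CPU that enables it by default:
  llc -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr128 input.mir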
---
 llvm/lib/Target/AArch64/AArch64Features.td   |  3 +++
 llvm/lib/Target/AArch64/AArch64Processors.td | 10 ++++++++++
 2 files changed, 13 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 05562516e5198..2fd449ef3f5f8 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -624,6 +624,9 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
 def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
                                         "Has zero-cycle register moves for FPR32 registers">;
 
+def FeatureZCRegMoveFPR128 : SubtargetFeature<"zcm-fpr128", "HasZeroCycleRegMoveFPR128", "true",
+                                        "Has zero-cycle register moves for FPR128 registers">;
+
 def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
                                         "Has zero-cycle zeroing instructions for generic registers">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index dcccde4a4d666..e6cc78d213e8e 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -313,6 +313,7 @@ def TuneAppleA7  : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing,
                                     FeatureZCZeroingFPWorkaround]>;
 
@@ -327,6 +328,7 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
@@ -340,6 +342,7 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
@@ -353,6 +356,7 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
@@ -366,6 +370,7 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
@@ -384,6 +389,7 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
@@ -402,6 +408,7 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
@@ -420,6 +427,7 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
@@ -438,6 +446,7 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
                                     FeatureStorePairSuppress,
                                     FeatureZCRegMoveGPR64,
                                     FeatureZCRegMoveFPR64,
+                                    FeatureZCRegMoveFPR128,
                                     FeatureZCZeroing]>;
 
 def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
@@ -455,6 +464,7 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
                                      FeatureFuseLiterals,
                                      FeatureZCRegMoveGPR64,
                                      FeatureZCRegMoveFPR64,
+                                     FeatureZCRegMoveFPR128,
                                      FeatureZCZeroing
                                      ]>;
 

>From f7e3fd9913dd1bb072a9263aff77129151712461 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 8 Jul 2025 21:05:08 +0300
Subject: [PATCH 2/4] [CodeGen] Add 2 subtarget hooks
 canLowerToZeroCycleReg[Move|Zeroing]

Adds two subtarget hooks, `canLowerToZeroCycleRegMove` and
`canLowerToZeroCycleRegZeroing`, to query whether an instruction can be
lowered to a zero-cycle instruction. The logic depends on the
microarchitecture. This patch also provides an AArch64 implementation
based on `AArch64InstrInfo::copyPhysReg`, which supports both physical and
virtual registers.

It prepares for a register coalescer optimization to prevent rematerialization of moves where the target supports ZCM.
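
As a usage sketch (hypothetical call site, not part of this patch), code
that holds a COPY and its two registers could consult the hooks as follows;
`MF`, `CopyMI`, `DstReg`, and `SrcReg` are assumed to be in scope:

  // Ask the subtarget whether this COPY is expected to become a zero
  // cycle move or zeroing once it is lowered post-RA.
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  if (STI.canLowerToZeroCycleRegMove(CopyMI, DstReg, SrcReg) ||
      STI.canLowerToZeroCycleRegZeroing(CopyMI, DstReg, SrcReg)) {
    // The COPY is effectively free on this core; prefer keeping it.
  }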
---
 .../llvm/CodeGen/TargetSubtargetInfo.h        | 42 ++++++++++
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  | 81 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64Subtarget.h    | 13 +++
 3 files changed, 136 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 45e67d80629cb..c5a7ed19d54dd 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -185,6 +185,48 @@ class LLVM_ABI TargetSubtargetInfo : public MCSubtargetInfo {
     return false;
   }
 
+  /// Returns true if CopyMI can be lowered to a zero cycle register move.
+  /// Otherwise, returns false.
+  ///
+  /// Lowering to zero cycle register moves depends on the microarchitecture
+  /// and on the specific architectural registers and instructions involved.
+  /// Thus, it is currently applied after register allocation,
+  /// when the `ExpandPostRAPseudos` pass calls `TargetInstrInfo::lowerCopy`,
+  /// which in turn calls `TargetInstrInfo::copyPhysReg`.
+  ///
+  /// Subtargets can override this method to classify lowering candidates.
+  /// Note that this cannot be defined in tablegen because it operates at
+  /// a higher level.
+  ///
+  /// NOTE: Subtargets must maintain consistency between the logic here and
+  /// the lowering logic.
+  virtual bool canLowerToZeroCycleRegMove(const MachineInstr *CopyMI,
+                                          const Register &DestReg,
+                                          const Register &SrcReg) const {
+    return false;
+  }
+
+  /// Returns true if CopyMI can be lowered to zero cycle register zeroing.
+  /// Otherwise, returns false.
+  ///
+  /// Lowering to zero cycle register zeroing depends on the microarchitecture
+  /// and on the specific architectural registers and instructions involved.
+  /// Thus, it currently takes place after register allocation,
+  /// when the `ExpandPostRAPseudos` pass calls `TargetInstrInfo::lowerCopy`,
+  /// which in turn calls `TargetInstrInfo::copyPhysReg`.
+  ///
+  /// Subtargets can override this method to classify lowering candidates.
+  /// Note that this cannot be defined in tablegen because it operates at
+  /// a higher level.
+  ///
+  /// NOTE: Subtargets must maintain consistency between the logic here and
+  /// the lowering logic.
+  virtual bool canLowerToZeroCycleRegZeroing(const MachineInstr *CopyMI,
+                                             const Register &DestReg,
+                                             const Register &SrcReg) const {
+    return false;
+  }
+
   /// True if the subtarget should run MachineScheduler after aggressive
   /// coalescing.
   ///
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 68ed10570a52f..015c80371e52a 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -667,3 +667,84 @@ AArch64Subtarget::getPtrAuthBlockAddressDiscriminatorIfEnabled(
 bool AArch64Subtarget::enableMachinePipeliner() const {
   return getSchedModel().hasInstrSchedModel();
 }
+
+bool AArch64Subtarget::isRegInClass(const MachineInstr *MI, const Register &Reg,
+                                    const TargetRegisterClass *TRC) const {
+  if (Reg.isPhysical()) {
+    return TRC->contains(Reg);
+  } else {
+    const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+    return TRC->hasSubClassEq(MRI.getRegClass(Reg));
+  }
+}
+
+/// NOTE: must maintain consistency with `AArch64InstrInfo::copyPhysReg`.
+bool AArch64Subtarget::canLowerToZeroCycleRegMove(
+    const MachineInstr *CopyMI, const Register &DestReg,
+    const Register &SrcReg) const {
+  if (isRegInClass(CopyMI, DestReg, &AArch64::GPR32allRegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::GPR32allRegClass) &&
+      DestReg != AArch64::WZR) {
+    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP ||
+        SrcReg != AArch64::WZR || !hasZeroCycleZeroingGP()) {
+      return hasZeroCycleRegMoveGPR64() || hasZeroCycleRegMoveGPR32();
+    }
+    return false;
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::GPR64allRegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::GPR64allRegClass) &&
+      DestReg != AArch64::XZR) {
+    if (DestReg == AArch64::SP || SrcReg == AArch64::SP ||
+        SrcReg != AArch64::XZR || !hasZeroCycleZeroingGP()) {
+      return hasZeroCycleRegMoveGPR64();
+    }
+    return false;
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::FPR128RegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::FPR128RegClass)) {
+    return isNeonAvailable() && hasZeroCycleRegMoveFPR128();
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::FPR64RegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::FPR64RegClass)) {
+    return hasZeroCycleRegMoveFPR64();
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::FPR32RegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::FPR32RegClass)) {
+    return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64();
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::FPR16RegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::FPR16RegClass)) {
+    return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64();
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::FPR8RegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::FPR8RegClass)) {
+    return hasZeroCycleRegMoveFPR32() || hasZeroCycleRegMoveFPR64();
+  }
+
+  return false;
+}
+
+/// NOTE: must maintain consistency with `AArch64InstrInfo::copyPhysReg`.
+bool AArch64Subtarget::canLowerToZeroCycleRegZeroing(
+    const MachineInstr *CopyMI, const Register &DestReg,
+    const Register &SrcReg) const {
+  if (isRegInClass(CopyMI, DestReg, &AArch64::GPR32allRegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::GPR32allRegClass) &&
+      DestReg != AArch64::WZR) {
+    return AArch64::WZR == SrcReg && hasZeroCycleZeroingGP();
+  }
+
+  if (isRegInClass(CopyMI, DestReg, &AArch64::GPR64allRegClass) &&
+      isRegInClass(CopyMI, SrcReg, &AArch64::GPR64allRegClass) &&
+      DestReg != AArch64::XZR) {
+    return AArch64::XZR == SrcReg && hasZeroCycleZeroingGP();
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index f95b0fafc607f..e08bd3e496479 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -120,6 +120,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   /// Initialize properties based on the selected processor family.
   void initializeProperties(bool HasMinSize);
 
+  /// Returns true if Reg is virtual and is assigned to,
+  /// or is physical and is a member of, the TRC register class.
+  /// Otherwise, returns false.
+  bool isRegInClass(const MachineInstr *MI, const Register &Reg,
+                    const TargetRegisterClass *TRC) const;
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -163,6 +169,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool enableMachinePipeliner() const override;
   bool useDFAforSMS() const override { return false; }
 
+  bool canLowerToZeroCycleRegMove(const MachineInstr *CopyMI,
+                                  const Register &DestReg,
+                                  const Register &SrcReg) const override;
+  bool canLowerToZeroCycleRegZeroing(const MachineInstr *CopyMI,
+                                     const Register &DestReg,
+                                     const Register &SrcReg) const override;
+
   /// Returns ARM processor family.
   /// Avoid this function! CPU specifics should be kept local to this class
   /// and preferably modeled with SubtargetFeatures or properties in

>From 1d33c7fbfc3810d6def1857a57b4be701e2a69c5 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 8 Jul 2025 21:21:31 +0300
Subject: [PATCH 3/4] [CodeGen] Add target hook
 shouldReMaterializeTrivialRegDef

Adds a target hook `shouldReMaterializeTrivialRegDef` that lets a target
specify whether rematerialization of a copy is beneficial. This patch also
provides an AArch64 implementation based on the new subtarget hooks
`canLowerToZeroCycleReg[Move|Zeroing]`.

It prepares for a register coalescer optimization to prevent rematerialization of moves where the target supports ZCM.
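
The default implementation returns true, so targets that do not override
the hook keep the current rematerialization behavior. A minimal sketch of
an override for a hypothetical target (it mirrors the AArch64
implementation in the diff below):

  // Rematerializing the trivial def is only worthwhile when the copy
  // cannot be lowered to a zero cycle instruction on this subtarget.
  bool SomeTargetInstrInfo::shouldReMaterializeTrivialRegDef(
      const MachineInstr *CopyMI, const Register &DestReg,
      const Register &SrcReg) const {
    return !Subtarget.canLowerToZeroCycleRegMove(CopyMI, DestReg, SrcReg) &&
           !Subtarget.canLowerToZeroCycleRegZeroing(CopyMI, DestReg, SrcReg);
  }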
---
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 16 ++++++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 10 ++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h   |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index b5b83c7ff1164..4a5b2fb0dd613 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -418,6 +418,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo {
     return true;
   }
 
+  /// Returns true if CopyMI should be considered for register
+  /// definition rematerialization. Otherwise, returns false.
+  ///
+  /// Rematerialization can replace a source register with the value
+  /// from its definition. It is applied in the register coalescer,
+  /// after instruction selection and before register allocation.
+  ///
+  /// Targets can override this method to classify rematerialization
+  /// candidates. Note that this cannot be defined in tablegen because it
+  /// operates at a higher level.
+  virtual bool shouldReMaterializeTrivialRegDef(const MachineInstr *CopyMI,
+                                                const Register &DestReg,
+                                                const Register &SrcReg) const {
+    return true;
+  }
+
   /// Re-issue the specified 'original' instruction at the
   /// specific location targeting a new destination register.
   /// The register in Orig->getOperand(0).getReg() will be substituted by
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8847c62690714..ee178a1d64234 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1029,6 +1029,13 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   }
 }
 
+bool AArch64InstrInfo::shouldReMaterializeTrivialRegDef(
+    const MachineInstr *CopyMI, const Register &DestReg,
+    const Register &SrcReg) const {
+  return !Subtarget.canLowerToZeroCycleRegMove(CopyMI, DestReg, SrcReg) &&
+         !Subtarget.canLowerToZeroCycleRegZeroing(CopyMI, DestReg, SrcReg);
+}
+
 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
@@ -5025,6 +5032,9 @@ void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
   }
 }
 
+/// NOTE: must maintain consistency with
+/// `AArch64Subtarget::canLowerToZeroCycleRegMove` and
+/// `AArch64Subtarget::canLowerToZeroCycleRegZeroing`.
 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DestReg,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..e858e93cab2a4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -189,6 +189,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
 
   bool isAsCheapAsAMove(const MachineInstr &MI) const override;
 
+  bool shouldReMaterializeTrivialRegDef(const MachineInstr *CopyMI,
+                                        const Register &DestReg,
+                                        const Register &SrcReg) const override;
+
   bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
                              Register &DstReg, unsigned &SubIdx) const override;
 

>From c1eaf8ecc4069b47161d3992306407620c29dedc Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Wed, 9 Jul 2025 16:07:57 +0300
Subject: [PATCH 4/4] [CodeGen] Prevent register coalescer rematerialization
 based on target

This change makes the register coalescer refrain from rematerializing a
trivial def for a move instruction if the target advises against it, based
on the new target hook `shouldReMaterializeTrivialRegDef`. The filter is
appended to the existing logic. The patch includes isolated MIR tests for
all supported register classes and fixes existing tests.
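
The new MIR tests can be exercised in isolation, for example (the command
matches the RUN lines of the GPR test added below):

  llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 \
      -run-pass=register-coalescer -verify-coalescing \
      llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir

On apple-m1 the COPY survives coalescing, while with -mattr=-zcm-gpr64 or
on the arm64-linux-gnu triple the def is rematerialized as before.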
---
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |   9 +
 .../test/CodeGen/AArch64/arm64-abi-varargs.ll |  31 ++--
 ...g-coalesce-remat-zero-cycle-regmov-fpr.mir | 174 ++++++++++++++++++
 ...g-coalesce-remat-zero-cycle-regmov-gpr.mir |  90 +++++++++
 llvm/test/CodeGen/AArch64/arm64-vshuffle.ll   |  12 +-
 .../AArch64/clear-dead-implicit-def-impdef.ll |  53 +++---
 6 files changed, 320 insertions(+), 49 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir
 create mode 100644 llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 2d25f097348af..8f111775ee09d 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -69,6 +69,9 @@ STATISTIC(numCrossRCs, "Number of cross class joins performed");
 STATISTIC(numCommutes, "Number of instruction commuting performed");
 STATISTIC(numExtends, "Number of copies extended");
 STATISTIC(NumReMats, "Number of instructions re-materialized");
+STATISTIC(NumReMatsPrevented,
+          "Number of rematerializations prevented by the "
+          "`shouldReMaterializeTrivialRegDef` hook");
 STATISTIC(NumInflated, "Number of register classes inflated");
 STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
 STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
@@ -1400,6 +1403,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
   if (!Edit.canRematerializeAt(RM, ValNo, CopyIdx))
     return false;
 
+  if (!TII->shouldReMaterializeTrivialRegDef(CopyMI, DstReg, SrcReg)) {
+    LLVM_DEBUG(dbgs() << "Remat prevented: " << CopyIdx << "\t" << *CopyMI);
+    ++NumReMatsPrevented;
+    return false;
+  }
+
   DebugLoc DL = CopyMI->getDebugLoc();
   MachineBasicBlock *MBB = CopyMI->getParent();
   MachineBasicBlock::iterator MII =
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 1b22514a59d60..890367a761281 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -64,18 +64,18 @@ define i32 @main() nounwind ssp {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
-; CHECK-NEXT:    mov w9, #1 ; =0x1
-; CHECK-NEXT:    mov w8, #2 ; =0x2
-; CHECK-NEXT:    stp w8, w9, [sp, #72]
-; CHECK-NEXT:    mov w9, #3 ; =0x3
-; CHECK-NEXT:    mov w8, #4 ; =0x4
-; CHECK-NEXT:    stp w8, w9, [sp, #64]
-; CHECK-NEXT:    mov w9, #5 ; =0x5
-; CHECK-NEXT:    mov w8, #6 ; =0x6
-; CHECK-NEXT:    stp w8, w9, [sp, #56]
-; CHECK-NEXT:    mov w9, #7 ; =0x7
-; CHECK-NEXT:    mov w8, #8 ; =0x8
-; CHECK-NEXT:    stp w8, w9, [sp, #48]
+; CHECK-NEXT:    mov w8, #1 ; =0x1
+; CHECK-NEXT:    mov w1, #2 ; =0x2
+; CHECK-NEXT:    stp w1, w8, [sp, #72]
+; CHECK-NEXT:    mov w2, #3 ; =0x3
+; CHECK-NEXT:    mov w3, #4 ; =0x4
+; CHECK-NEXT:    stp w3, w2, [sp, #64]
+; CHECK-NEXT:    mov w4, #5 ; =0x5
+; CHECK-NEXT:    mov w5, #6 ; =0x6
+; CHECK-NEXT:    stp w5, w4, [sp, #56]
+; CHECK-NEXT:    mov w6, #7 ; =0x7
+; CHECK-NEXT:    mov w7, #8 ; =0x8
+; CHECK-NEXT:    stp w7, w6, [sp, #48]
 ; CHECK-NEXT:    mov w8, #9 ; =0x9
 ; CHECK-NEXT:    mov w9, #10 ; =0xa
 ; CHECK-NEXT:    stp w9, w8, [sp, #40]
@@ -86,13 +86,6 @@ define i32 @main() nounwind ssp {
 ; CHECK-NEXT:    str x9, [sp, #8]
 ; CHECK-NEXT:    str w8, [sp]
 ; CHECK-NEXT:    add x0, sp, #76
-; CHECK-NEXT:    mov w1, #2 ; =0x2
-; CHECK-NEXT:    mov w2, #3 ; =0x3
-; CHECK-NEXT:    mov w3, #4 ; =0x4
-; CHECK-NEXT:    mov w4, #5 ; =0x5
-; CHECK-NEXT:    mov w5, #6 ; =0x6
-; CHECK-NEXT:    mov w6, #7 ; =0x7
-; CHECK-NEXT:    mov w7, #8 ; =0x8
 ; CHECK-NEXT:    bl _fn9
 ; CHECK-NEXT:    mov w0, #0 ; =0x0
 ; CHECK-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir
new file mode 100644
index 0000000000000..303e25edb2b18
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-fpr.mir
@@ -0,0 +1,174 @@
+# RUN: llc -o - -mtriple=arm64-linux-gnu -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-LINUX
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=generic -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-APPLE
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=CPU
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-fpr64 -mattr=-zcm-fpr128 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTATTR
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mattr=+zcm-fpr64 -mattr=+zcm-fpr128 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=ATTR
+
+--- |
+  define void @remat_FPR128() {
+    ret void
+  }
+  declare void @foo_v4i32(<4 x float>, <4 x float>)
+
+  define void @remat_FPR64() {
+    ret void
+  }
+  declare void @foo_double(double, double)
+
+  define void @remat_FPR32() {
+    ret void
+  }
+  declare void @foo_float(float, float)
+
+  define void @remat_FPR16() {
+    ret void
+  }
+  declare void @foo_half(half, half)
+...
+---
+name:            remat_FPR128
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_FPR128
+
+    ; NOTCPU-LINUX: %0:fpr128 = MOVIv2d_ns 64
+    ; NOTCPU-LINUX-NEXT: %1:fpr128 = MOVIv2d_ns 64
+    ; NOTCPU-LINUX: BL @foo_v4i32
+
+    ; NOTCPU-APPLE: %0:fpr128 = MOVIv2d_ns 64
+    ; NOTCPU-APPLE-NEXT: %1:fpr128 = MOVIv2d_ns 64
+    ; NOTCPU-APPLE: BL @foo_v4i32
+
+    ; CPU: %0:fpr128 = MOVIv2d_ns 64
+    ; CPU-NEXT: %1:fpr128 = COPY %0
+    ; CPU: BL @foo_v4i32
+
+    ; NOTATTR: %0:fpr128 = MOVIv2d_ns 64
+    ; NOTATTR-NEXT: %1:fpr128 = MOVIv2d_ns 64
+    ; NOTATTR: BL @foo_v4i32
+
+    ; ATTR: %0:fpr128 = MOVIv2d_ns 64
+    ; ATTR-NEXT: %1:fpr128 = COPY %0
+    ; ATTR: BL @foo_v4i32
+
+    %0:fpr128 = MOVIv2d_ns 64
+    %1:fpr128 = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = ADDv4i32 %1, %1
+
+    BL @foo_v4i32, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
+
+---
+name:            remat_FPR64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_FPR64
+
+    ; NOTCPU-LINUX: %0:fpr64 = FMOVDi 64
+    ; NOTCPU-LINUX-NEXT: %1:fpr64 = FMOVDi 64
+    ; NOTCPU-LINUX: BL @foo_double
+
+    ; NOTCPU-APPLE: %0:fpr64 = FMOVDi 64
+    ; NOTCPU-APPLE-NEXT: %1:fpr64 = FMOVDi 64
+    ; NOTCPU-APPLE: BL @foo_double
+
+    ; CPU: %0:fpr64 = FMOVDi 64
+    ; CPU-NEXT: %1:fpr64 = COPY %0
+    ; CPU: BL @foo_double
+
+    ; NOTATTR: %0:fpr64 = FMOVDi 64
+    ; NOTATTR-NEXT: %1:fpr64 = FMOVDi 64
+    ; NOTATTR: BL @foo_double
+
+    ; ATTR: %0:fpr64 = FMOVDi 64
+    ; ATTR-NEXT: %1:fpr64 = COPY %0
+    ; ATTR: BL @foo_double
+
+    %0:fpr64 = FMOVDi 64
+    %1:fpr64 = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = FADDDrr %1, %1, implicit $fpcr
+
+    BL @foo_double, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
+
+---
+name:            remat_FPR32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_FPR32
+
+    ; NOTCPU-LINUX: %0:fpr32 = FMOVSi 64
+    ; NOTCPU-LINUX-NEXT: %1:fpr32 = FMOVSi 64
+    ; NOTCPU-LINUX: BL @foo_float
+
+    ; NOTCPU-APPLE: %0:fpr32 = FMOVSi 64
+    ; NOTCPU-APPLE-NEXT: %1:fpr32 = FMOVSi 64
+    ; NOTCPU-APPLE: BL @foo_float
+
+    ; CPU: %0:fpr32 = FMOVSi 64
+    ; CPU-NEXT: %1:fpr32 = COPY %0
+    ; CPU: BL @foo_float
+
+    ; NOTATTR: %0:fpr32 = FMOVSi 64
+    ; NOTATTR-NEXT: %1:fpr32 = FMOVSi 64
+    ; NOTATTR: BL @foo_float
+
+    ; ATTR: %0:fpr32 = FMOVSi 64
+    ; ATTR-NEXT: %1:fpr32 = COPY %0
+    ; ATTR: BL @foo_float
+
+    %0:fpr32 = FMOVSi 64
+    %1:fpr32 = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = FADDSrr %1, %1, implicit $fpcr
+
+    BL @foo_float, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
+
+---
+name:            remat_FPR16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_FPR16
+
+    ; NOTCPU-LINUX: %0:fpr16 = FMOVHi 64
+    ; NOTCPU-LINUX-NEXT: %1:fpr16 = FMOVHi 64
+    ; NOTCPU-LINUX: BL @foo_half
+
+    ; NOTCPU-APPLE: %0:fpr16 = FMOVHi 64
+    ; NOTCPU-APPLE-NEXT: %1:fpr16 = FMOVHi 64
+    ; NOTCPU-APPLE: BL @foo_half
+
+    ; CPU: %0:fpr16 = FMOVHi 64
+    ; CPU-NEXT: %1:fpr16 = COPY %0
+    ; CPU: BL @foo_half
+
+    ; NOTATTR: %0:fpr16 = FMOVHi 64
+    ; NOTATTR-NEXT: %1:fpr16 = FMOVHi 64
+    ; NOTATTR: BL @foo_half
+
+    ; ATTR: %0:fpr16 = FMOVHi 64
+    ; ATTR-NEXT: %1:fpr16 = COPY %0
+    ; ATTR: BL @foo_half
+
+    %0:fpr16 = FMOVHi 64
+    %1:fpr16 = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = FADDHrr %1, %1, implicit $fpcr
+
+    BL @foo_half, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
diff --git a/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir
new file mode 100644
index 0000000000000..6247572b2cf2c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-reg-coalesce-remat-zero-cycle-regmov-gpr.mir
@@ -0,0 +1,90 @@
+# RUN: llc -o - -mtriple=arm64-linux-gnu -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-LINUX
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=generic -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTCPU-APPLE
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=CPU
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mcpu=apple-m1 -mattr=-zcm-gpr64 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=NOTATTR
+# RUN: llc -o - -mtriple=arm64-apple-macosx -mattr=+zcm-gpr64 -run-pass=register-coalescer -verify-coalescing %s | FileCheck %s -check-prefixes=ATTR
+
+--- |
+  define void @remat_GPR32() {
+    ret void
+  }
+  declare void @foo_i32(i32, i32)
+
+  define void @remat_GPR64() {
+    ret void
+  }
+  declare void @foo_i64(i64, i64)
+...
+---
+name:            remat_GPR32
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_GPR32
+
+    ; NOTCPU-LINUX: %0:gpr32 = MOVi32imm 32
+    ; NOTCPU-LINUX-NEXT: %1:gpr32common = MOVi32imm 32
+    ; NOTCPU-LINUX: BL @foo_i32
+
+    ; NOTCPU-APPLE: %0:gpr32 = MOVi32imm 32
+    ; NOTCPU-APPLE-NEXT: %1:gpr32common = MOVi32imm 32
+    ; NOTCPU-APPLE: BL @foo_i32
+
+    ; CPU: %0:gpr32 = MOVi32imm 32
+    ; CPU-NEXT: %1:gpr32sp = COPY %0
+    ; CPU: BL @foo_i32
+
+    ; NOTATTR: %0:gpr32 = MOVi32imm 32
+    ; NOTATTR-NEXT: %1:gpr32common = MOVi32imm 32
+    ; NOTATTR: BL @foo_i32
+
+    ; ATTR: %0:gpr32 = MOVi32imm 32
+    ; ATTR-NEXT: %1:gpr32sp = COPY %0
+    ; ATTR: BL @foo_i32
+
+    %0:gpr32 = MOVi32imm 32
+    %1:gpr32sp = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = ADDWri %1, 1, 0
+
+    BL @foo_i32, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
+
+---
+name:            remat_GPR64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: remat_GPR64
+
+    ; NOTCPU-LINUX: %0:gpr64 = MOVi64imm 64
+    ; NOTCPU-LINUX-NEXT: %1:gpr64common = MOVi64imm 64
+    ; NOTCPU-LINUX: BL @foo_i64
+
+    ; NOTCPU-APPLE: %0:gpr64 = MOVi64imm 64
+    ; NOTCPU-APPLE-NEXT: %1:gpr64common = MOVi64imm 64
+    ; NOTCPU-APPLE: BL @foo_i64
+
+    ; CPU: %0:gpr64 = MOVi64imm 64
+    ; CPU-NEXT: %1:gpr64sp = COPY %0
+    ; CPU: BL @foo_i64
+
+    ; NOTATTR: %0:gpr64 = MOVi64imm 64
+    ; NOTATTR-NEXT: %1:gpr64common = MOVi64imm 64
+    ; NOTATTR: BL @foo_i64
+
+    ; ATTR: %0:gpr64 = MOVi64imm 64
+    ; ATTR-NEXT: %1:gpr64sp = COPY %0
+    ; ATTR: BL @foo_i64
+
+    %0:gpr64 = MOVi64imm 64
+    %1:gpr64sp = COPY %0
+
+    ; Creates a live range interference to prevent coalescing and to force
+    ; an attempt to rematerialize the previous COPY.
+    %1 = ADDXri %1, 1, 0
+
+    BL @foo_i64, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit %0, implicit %1
+    RET_ReallyLR
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
index b225d9a1acaf5..3edec9c9d8fc6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -4,6 +4,7 @@ define <8 x i1> @test1() {
 ; CHECK-LABEL: test1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    movi.16b v0, #0
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
   %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
@@ -58,9 +59,14 @@ bb:
 ; CHECK:         .byte   0                       ; 0x0
 ; CHECK:         .byte   0                       ; 0x0
 define <16 x i1> @test4(ptr %ptr, i32 %v) {
-; CHECK-LABEL: _test4:
-; CHECK:         adrp    x[[REG3:[0-9]+]], lCPI3_0 at PAGE
-; CHECK:         ldr     q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0 at PAGEOFF]
+; CHECK-LABEL: test4:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, lCPI3_0 at PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr q0, [x8, lCPI3_0 at PAGEOFF]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .loh AdrpLdr Lloh0, Lloh1
 bb:
   %Shuff = shufflevector <16 x i1> zeroinitializer,
      <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1,
diff --git a/llvm/test/CodeGen/AArch64/clear-dead-implicit-def-impdef.ll b/llvm/test/CodeGen/AArch64/clear-dead-implicit-def-impdef.ll
index bc26eca6f27ef..b5c5124b664a2 100644
--- a/llvm/test/CodeGen/AArch64/clear-dead-implicit-def-impdef.ll
+++ b/llvm/test/CodeGen/AArch64/clear-dead-implicit-def-impdef.ll
@@ -13,16 +13,17 @@ define void @_ZN38SanitizerCommonInterceptors_Scanf_Test8TestBodyEv(ptr %.str.40
 ; CHECK-NEXT:    stp x22, x21, [sp, #80] ; 16-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] ; 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #112] ; 16-byte Folded Spill
-; CHECK-NEXT:    mov x24, x6
-; CHECK-NEXT:    mov x19, x5
-; CHECK-NEXT:    mov x20, x4
-; CHECK-NEXT:    mov x21, x3
-; CHECK-NEXT:    mov x22, x2
-; CHECK-NEXT:    mov x23, x1
-; CHECK-NEXT:    mov x25, x0
+; CHECK-NEXT:    mov x25, x6
+; CHECK-NEXT:    str x5, [sp, #24] ; 8-byte Folded Spill
+; CHECK-NEXT:    mov x21, x4
+; CHECK-NEXT:    mov x22, x3
+; CHECK-NEXT:    mov x23, x2
+; CHECK-NEXT:    mov x24, x1
+; CHECK-NEXT:    mov x26, x0
 ; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov w19, #1 ; =0x1
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
-; CHECK-NEXT:    mov w1, #1 ; =0x1
+; CHECK-NEXT:    mov x1, x19
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
 ; CHECK-NEXT:    mov w28, #4 ; =0x4
 ; CHECK-NEXT:    stp x28, x28, [sp, #8]
@@ -34,59 +35,57 @@ define void @_ZN38SanitizerCommonInterceptors_Scanf_Test8TestBodyEv(ptr %.str.40
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
-; CHECK-NEXT:    mov w27, #8 ; =0x8
-; CHECK-NEXT:    str x27, [sp]
+; CHECK-NEXT:    mov w20, #8 ; =0x8
+; CHECK-NEXT:    str x20, [sp]
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
-; CHECK-NEXT:    mov w26, #1 ; =0x1
-; CHECK-NEXT:    stp xzr, x26, [sp]
+; CHECK-NEXT:    stp xzr, x19, [sp]
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
-; CHECK-NEXT:    str x26, [sp]
+; CHECK-NEXT:    str x19, [sp]
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
 ; CHECK-NEXT:    str x28, [sp]
 ; CHECK-NEXT:  Lloh0:
-; CHECK-NEXT:    adrp x26, _.str at GOTPAGE
+; CHECK-NEXT:    adrp x27, _.str at GOTPAGE
 ; CHECK-NEXT:  Lloh1:
-; CHECK-NEXT:    ldr x26, [x26, _.str at GOTPAGEOFF]
-; CHECK-NEXT:    mov x0, x26
+; CHECK-NEXT:    ldr x27, [x27, _.str at GOTPAGEOFF]
+; CHECK-NEXT:    mov x0, x27
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL9testScanfPKcjz
-; CHECK-NEXT:    str wzr, [x24]
-; CHECK-NEXT:    str x27, [sp]
-; CHECK-NEXT:    mov x0, x25
+; CHECK-NEXT:    str wzr, [x25]
+; CHECK-NEXT:    str x20, [sp]
+; CHECK-NEXT:    mov x0, x26
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str x28, [sp]
-; CHECK-NEXT:    mov x0, x23
+; CHECK-NEXT:    mov x0, x24
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str x28, [sp]
-; CHECK-NEXT:    mov x0, x22
+; CHECK-NEXT:    mov x0, x23
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str x28, [sp]
-; CHECK-NEXT:    mov x0, x21
+; CHECK-NEXT:    mov x0, x22
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str x28, [sp]
-; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    mov x0, x21
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str xzr, [sp]
-; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    ldr x0, [sp, #24] ; 8-byte Folded Reload
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
 ; CHECK-NEXT:    str xzr, [sp]
-; CHECK-NEXT:    mov x0, x26
+; CHECK-NEXT:    mov x0, x27
 ; CHECK-NEXT:    mov w1, #0 ; =0x0
 ; CHECK-NEXT:    bl __ZL20testScanfNoGnuMallocPKcjz
-; CHECK-NEXT:    mov w8, #1 ; =0x1
-; CHECK-NEXT:    stp x8, xzr, [sp, #8]
+; CHECK-NEXT:    stp x19, xzr, [sp, #8]
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    mov x0, #0 ; =0x0
 ; CHECK-NEXT:    mov w1, #0 ; =0x0



More information about the llvm-commits mailing list