[llvm] [AArch64] Hint regalloc to choose distinct predicate for MATCH/CMP (PR #190139)

via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 02:37:20 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

<details>
<summary>Changes</summary>

For some cores it is preferable to choose a destination register that does not match the governing predicate for the CMP<> and MATCH instructions.

The hint is conservative in that it tries not to pick a callee-save register if it's not already used/allocated for other purposes, as that would introduce new spills/fills. Note that this might be preferable if e.g. the CMP/MATCH instruction is executed in a loop, but it might also be less preferable for small functions that have an SVE interface (p4-p15 are callee-saved).

It is enabled for all cores by default, but it can be disabled by adding the `disable-distinct-dst-reg-cmp-match` feature. This feature can also be added to specific cores if this behaviour is undesirable.

---

Patch is 414.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190139.diff


66 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64Features.td (+10) 
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+159-3) 
- (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+10-10) 
- (modified) llvm/test/CodeGen/AArch64/combine-storetomstore.ll (+56-56) 
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll (+9-9) 
- (modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/extract-vector-elt-sve.ll (+12-12) 
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+3-3) 
- (modified) llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll (+14-14) 
- (modified) llvm/test/CodeGen/AArch64/rcpc3-sve.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-cmp-select.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-fcvt.ll (+8-8) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll (+33-33) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll (+30-30) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll (+44-44) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll (+40-40) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll (+11-11) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll (+8-8) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll (+50-50) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll (+58-58) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll (+64-64) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll (+51-51) 
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-insert-element.ll (+32-32) 
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-int-compares.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll (+8-8) 
- (modified) llvm/test/CodeGen/AArch64/sve-load-compare-store.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll (+23-23) 
- (added) llvm/test/CodeGen/AArch64/sve-match-cmp-predicate.ll (+205) 
- (modified) llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop.ll (+114-114) 
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll (+194-194) 
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll (+130-130) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll (+36-36) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-match.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-punpklo-combine.ll (+24-24) 
- (added) llvm/test/CodeGen/AArch64/sve-regalloc-hint-match-cmp.mir (+64) 
- (modified) llvm/test/CodeGen/AArch64/sve-scmp.ll (+14-14) 
- (modified) llvm/test/CodeGen/AArch64/sve-select.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-setcc.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll (+28-28) 
- (modified) llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll (+6-6) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+21-21) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+12-12) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll (+42-42) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+42-42) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll (+8-8) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+32-32) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+28-28) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+2-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll (+30-30) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll (+24-24) 
- (modified) llvm/test/CodeGen/AArch64/sve-trunc.ll (+4-4) 
- (modified) llvm/test/CodeGen/AArch64/sve-ucmp.ll (+14-14) 
- (modified) llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll (+28-28) 
- (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+14-14) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 4fbad6643d77a..7447fe7afbc68 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -911,6 +911,16 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
                                                "HasDisableFastIncVL", "true",
                                                "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
 
+// For some cores it is preferable to choose a destination register that does
+// not match the governing predicate for the CMP<> and MATCH instructions. When
+// enabled the register allocator tries to use a distinct register. While the
+// algorithm tries to avoid introducing callee-save spill/fills and therefore
+// shouldn't have any adverse effects on cores where using distinct registers
+// is not necessarily preferable, we still provide a way to turn the feature off.
+def FeatureDisableDistinctDstRegCmpMatch : SubtargetFeature<
+    "disable-distinct-dst-reg-cmp-match", "UseDistinctDstRegCmpMatch",
+    "false", "Do not use a distinct destination register for CMP/MATCH instructions">;
+
 // On most processors we want to avoid moving from WZR to vector registers
 // (relying on materializing 0 to a FPR and moving from there instead),
 // but on some (in-order) cores it's preferable to avoid the extra instruction instead.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 8c0dd4381fae8..0518bfb0cd44f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1114,6 +1114,159 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
+static bool requiresMatchCmpRegallocHint(unsigned Opc) {
+  switch (Opc) {
+  default:
+    return false;
+  // All match instructions
+  case AArch64::MATCH_PPzZZ_B:
+  case AArch64::MATCH_PPzZZ_H:
+  // Vector compare instructions (CMPL* are aliases of CMPG*/H*)
+  case AArch64::CMPEQ_PPzZZ_B:
+  case AArch64::CMPEQ_PPzZZ_H:
+  case AArch64::CMPEQ_PPzZZ_S:
+  case AArch64::CMPEQ_PPzZZ_D:
+  case AArch64::CMPNE_PPzZZ_B:
+  case AArch64::CMPNE_PPzZZ_H:
+  case AArch64::CMPNE_PPzZZ_S:
+  case AArch64::CMPNE_PPzZZ_D:
+  case AArch64::CMPGE_PPzZZ_B:
+  case AArch64::CMPGE_PPzZZ_H:
+  case AArch64::CMPGE_PPzZZ_S:
+  case AArch64::CMPGE_PPzZZ_D:
+  case AArch64::CMPHS_PPzZZ_B:
+  case AArch64::CMPHS_PPzZZ_H:
+  case AArch64::CMPHS_PPzZZ_S:
+  case AArch64::CMPHS_PPzZZ_D:
+  case AArch64::CMPGT_PPzZZ_B:
+  case AArch64::CMPGT_PPzZZ_H:
+  case AArch64::CMPGT_PPzZZ_S:
+  case AArch64::CMPGT_PPzZZ_D:
+  case AArch64::CMPHI_PPzZZ_B:
+  case AArch64::CMPHI_PPzZZ_H:
+  case AArch64::CMPHI_PPzZZ_S:
+  case AArch64::CMPHI_PPzZZ_D:
+  // Vector/immediate compare instructions
+  case AArch64::CMPEQ_PPzZI_B:
+  case AArch64::CMPEQ_PPzZI_H:
+  case AArch64::CMPEQ_PPzZI_S:
+  case AArch64::CMPEQ_PPzZI_D:
+  case AArch64::CMPNE_PPzZI_B:
+  case AArch64::CMPNE_PPzZI_H:
+  case AArch64::CMPNE_PPzZI_S:
+  case AArch64::CMPNE_PPzZI_D:
+  case AArch64::CMPGE_PPzZI_B:
+  case AArch64::CMPGE_PPzZI_H:
+  case AArch64::CMPGE_PPzZI_S:
+  case AArch64::CMPGE_PPzZI_D:
+  case AArch64::CMPHS_PPzZI_B:
+  case AArch64::CMPHS_PPzZI_H:
+  case AArch64::CMPHS_PPzZI_S:
+  case AArch64::CMPHS_PPzZI_D:
+  case AArch64::CMPGT_PPzZI_B:
+  case AArch64::CMPGT_PPzZI_H:
+  case AArch64::CMPGT_PPzZI_S:
+  case AArch64::CMPGT_PPzZI_D:
+  case AArch64::CMPHI_PPzZI_B:
+  case AArch64::CMPHI_PPzZI_H:
+  case AArch64::CMPHI_PPzZI_S:
+  case AArch64::CMPHI_PPzZI_D:
+  case AArch64::CMPLE_PPzZI_B:
+  case AArch64::CMPLE_PPzZI_H:
+  case AArch64::CMPLE_PPzZI_S:
+  case AArch64::CMPLE_PPzZI_D:
+  case AArch64::CMPLS_PPzZI_B:
+  case AArch64::CMPLS_PPzZI_H:
+  case AArch64::CMPLS_PPzZI_S:
+  case AArch64::CMPLS_PPzZI_D:
+  case AArch64::CMPLT_PPzZI_B:
+  case AArch64::CMPLT_PPzZI_H:
+  case AArch64::CMPLT_PPzZI_S:
+  case AArch64::CMPLT_PPzZI_D:
+  case AArch64::CMPLO_PPzZI_B:
+  case AArch64::CMPLO_PPzZI_H:
+  case AArch64::CMPLO_PPzZI_S:
+  case AArch64::CMPLO_PPzZI_D:
+  // Wide-vector compare instructions
+  case AArch64::CMPEQ_WIDE_PPzZZ_B:
+  case AArch64::CMPEQ_WIDE_PPzZZ_H:
+  case AArch64::CMPEQ_WIDE_PPzZZ_S:
+  case AArch64::CMPNE_WIDE_PPzZZ_B:
+  case AArch64::CMPNE_WIDE_PPzZZ_H:
+  case AArch64::CMPNE_WIDE_PPzZZ_S:
+  case AArch64::CMPGE_WIDE_PPzZZ_B:
+  case AArch64::CMPGE_WIDE_PPzZZ_H:
+  case AArch64::CMPGE_WIDE_PPzZZ_S:
+  case AArch64::CMPHS_WIDE_PPzZZ_B:
+  case AArch64::CMPHS_WIDE_PPzZZ_H:
+  case AArch64::CMPHS_WIDE_PPzZZ_S:
+  case AArch64::CMPGT_WIDE_PPzZZ_B:
+  case AArch64::CMPGT_WIDE_PPzZZ_H:
+  case AArch64::CMPGT_WIDE_PPzZZ_S:
+  case AArch64::CMPHI_WIDE_PPzZZ_B:
+  case AArch64::CMPHI_WIDE_PPzZZ_H:
+  case AArch64::CMPHI_WIDE_PPzZZ_S:
+  case AArch64::CMPLE_WIDE_PPzZZ_B:
+  case AArch64::CMPLE_WIDE_PPzZZ_H:
+  case AArch64::CMPLE_WIDE_PPzZZ_S:
+  case AArch64::CMPLS_WIDE_PPzZZ_B:
+  case AArch64::CMPLS_WIDE_PPzZZ_H:
+  case AArch64::CMPLS_WIDE_PPzZZ_S:
+  case AArch64::CMPLT_WIDE_PPzZZ_B:
+  case AArch64::CMPLT_WIDE_PPzZZ_H:
+  case AArch64::CMPLT_WIDE_PPzZZ_S:
+  case AArch64::CMPLO_WIDE_PPzZZ_B:
+  case AArch64::CMPLO_WIDE_PPzZZ_H:
+  case AArch64::CMPLO_WIDE_PPzZZ_S:
+    return true;
+  }
+}
+
+static bool HandleMatchCmpPredicateHint(Register VirtReg,
+                                        ArrayRef<MCPhysReg> Order,
+                                        SmallVectorImpl<MCPhysReg> &Hints,
+                                        const VirtRegMap *VRM,
+                                        const MachineRegisterInfo &MRI,
+                                        const AArch64Subtarget &ST,
+                                        const LiveRegMatrix *Matrix) {
+  const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+  if (!ST.useDistinctDstRegCmpMatch() ||
+      !AArch64::PPRRegClass.hasSubClassEq(RegRC) || !MRI.hasOneDef(VirtReg) ||
+      Order.size() < 2)
+    return false;
+
+  const MachineInstr *DefInst = MRI.getOneDef(VirtReg)->getParent();
+  if (!requiresMatchCmpRegallocHint(DefInst->getOpcode()))
+    return false;
+
+  Register Op1Reg = DefInst->getOperand(1).getReg();
+  if (Op1Reg.isVirtual())
+    Op1Reg = VRM->getPhys(Op1Reg);
+
+  // If no register is allocated for the general-predicate, it's not yet
+  // possible to choose a distinct register.
+  if (!Op1Reg.isValid())
+    return false;
+
+  // Move Op1Reg as the least preferred register.
+  //
+  // This might result in callee-save spills when the function takes/returns
+  // arguments in SVE registers (i.e. needs to preserve p4-p15) and can't reuse
+  // p0-p3. That's why we limit it to non-callee saved registers or to
+  // callee-saved registers that have already been allocated for other uses in
+  // the function.
+  DenseSet<unsigned> CSRs;
+  for (unsigned I = 0; MRI.getCalleeSavedRegs()[I]; ++I)
+    CSRs.insert(MRI.getCalleeSavedRegs()[I]);
+
+  Hints.append(Order.begin(), Order.end());
+  llvm::stable_sort(Hints, [&](Register A, Register B) {
+    return B == Op1Reg &&
+           (!CSRs.contains(A) || !MRI.def_empty(A) || Matrix->isPhysRegUsed(A));
+  });
+  return true;
+}
+
 // We add regalloc hints for different cases:
 // * Choosing a better destination operand for predicated SVE instructions
 //   where the inactive lanes are undef, by choosing a register that is not
@@ -1143,14 +1296,14 @@ bool AArch64RegisterInfo::getRegAllocationHints(
       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   const MachineRegisterInfo &MRI = MF.getRegInfo();
 
+  bool ConsiderOnlyHints =
+      TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+
   // For predicated SVE instructions where the inactive lanes are undef,
   // pick a destination register that is not unique to avoid introducing
   // a movprfx.
   const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
   if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
-    bool ConsiderOnlyHints = TargetRegisterInfo::getRegAllocationHints(
-        VirtReg, Order, Hints, MF, VRM);
-
     for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
       const MachineInstr &Def = *DefOp.getParent();
       if (DefOp.isImplicit() ||
@@ -1200,6 +1353,9 @@ bool AArch64RegisterInfo::getRegAllocationHints(
       return ConsiderOnlyHints;
   }
 
+  if (HandleMatchCmpPredicateHint(VirtReg, Order, Hints, VRM, MRI, ST, Matrix))
+    return ConsiderOnlyHints;
+
   if (!ST.hasSME() || !ST.isStreaming())
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 05d083a654cf6..778be79038a78 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -183,8 +183,8 @@ define <vscale x 1 x i1> @lane_mask_nxv1i1_i32(i32 %index, i32 %TC) {
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uqadd z0.s, z0.s, z1.s
 ; CHECK-NEXT:    mov z1.s, w1
-; CHECK-NEXT:    cmphi p0.s, p0/z, z1.s, z0.s
-; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    cmphi p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    punpklo p0.h, p1.b
 ; CHECK-NEXT:    punpklo p0.h, p0.b
 ; CHECK-NEXT:    ret
   %active.lane.mask = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 %index, i32 %TC)
@@ -303,8 +303,8 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
 ; CHECK-STREAMING-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-STREAMING-NEXT:    mov z1.b, w1
-; CHECK-STREAMING-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z1.b, z0.b
+; CHECK-STREAMING-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    ret
   %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i8(i8 %index, i8 %TC)
   ret <16 x i1> %active.lane.mask
@@ -329,8 +329,8 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
 ; CHECK-STREAMING-NEXT:    mov z1.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    orr z0.d, z0.d, z1.d
 ; CHECK-STREAMING-NEXT:    mov z1.b, w1
-; CHECK-STREAMING-NEXT:    cmphi p0.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT:    mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    cmphi p1.b, p0/z, z1.b, z0.b
+; CHECK-STREAMING-NEXT:    mov z0.b, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    ret
   %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC)
   ret <8 x i1> %active.lane.mask
@@ -360,8 +360,8 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
 ; CHECK-STREAMING-NEXT:    mov z1.h, w1
 ; CHECK-STREAMING-NEXT:    umin z0.h, z0.h, #255
 ; CHECK-STREAMING-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-STREAMING-NEXT:    cmphi p0.h, p0/z, z1.h, z0.h
-; CHECK-STREAMING-NEXT:    mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    cmphi p1.h, p0/z, z1.h, z0.h
+; CHECK-STREAMING-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    ret
   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC)
   ret <4 x i1> %active.lane.mask
@@ -389,8 +389,8 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
 ; CHECK-STREAMING-NEXT:    and w8, w1, #0xff
 ; CHECK-STREAMING-NEXT:    mov z1.s, w8
 ; CHECK-STREAMING-NEXT:    umin z0.s, z0.s, #255
-; CHECK-STREAMING-NEXT:    cmphi p0.s, p0/z, z1.s, z0.s
-; CHECK-STREAMING-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT:    cmphi p1.s, p0/z, z1.s, z0.s
+; CHECK-STREAMING-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-STREAMING-NEXT:    ret
   %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC)
   ret <2 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
index 1e4a695d1f4e8..d4244fedf7504 100644
--- a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
@@ -24,8 +24,8 @@ define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %m
 ; SVE-NEXT:    shl v1.4h, v1.4h, #15
 ; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT:    st1h { z0.h }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x i16>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load
@@ -40,8 +40,8 @@ define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %m
 ; SVE-NEXT:    ptrue p0.s, vl4
 ; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    shl v1.4s, v1.4s, #31
-; SVE-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; SVE-NEXT:    st1w { z0.s }, p0, [x0]
+; SVE-NEXT:    cmpne p1.s, p0/z, z1.s, #0
+; SVE-NEXT:    st1w { z0.s }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x i32>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load
@@ -62,9 +62,9 @@ define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %m
 ; SVE-NEXT:    shl v3.2d, v3.2d, #63
 ; SVE-NEXT:    shl v2.2d, v2.2d, #63
 ; SVE-NEXT:    cmpne p1.d, p0/z, z3.d, #0
-; SVE-NEXT:    cmpne p0.d, p0/z, z2.d, #0
+; SVE-NEXT:    cmpne p2.d, p0/z, z2.d, #0
 ; SVE-NEXT:    st1d { z1.d }, p1, [x0, x8, lsl #3]
-; SVE-NEXT:    st1d { z0.d }, p0, [x0]
+; SVE-NEXT:    st1d { z0.d }, p2, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x i64>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load
@@ -78,8 +78,8 @@ define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %
 ; SVE-NEXT:    shl v1.4h, v1.4h, #15
 ; SVE-NEXT:    ptrue p0.h, vl4
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT:    st1h { z0.h }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x half>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load
@@ -94,8 +94,8 @@ define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1>
 ; SVE-NEXT:    ptrue p0.s, vl4
 ; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    shl v1.4s, v1.4s, #31
-; SVE-NEXT:    cmpne p0.s, p0/z, z1.s, #0
-; SVE-NEXT:    st1w { z0.s }, p0, [x0]
+; SVE-NEXT:    cmpne p1.s, p0/z, z1.s, #0
+; SVE-NEXT:    st1w { z0.s }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x float>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load
@@ -116,9 +116,9 @@ define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1>
 ; SVE-NEXT:    shl v3.2d, v3.2d, #63
 ; SVE-NEXT:    shl v2.2d, v2.2d, #63
 ; SVE-NEXT:    cmpne p1.d, p0/z, z3.d, #0
-; SVE-NEXT:    cmpne p0.d, p0/z, z2.d, #0
+; SVE-NEXT:    cmpne p2.d, p0/z, z2.d, #0
 ; SVE-NEXT:    st1d { z1.d }, p1, [x0, x8, lsl #3]
-; SVE-NEXT:    st1d { z0.d }, p0, [x0]
+; SVE-NEXT:    st1d { z0.d }, p2, [x0]
 ; SVE-NEXT:    ret
   %load = load <4 x double>, ptr %ptr, align 32
   %sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load
@@ -132,8 +132,8 @@ define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mas
 ; SVE-NEXT:    shl v1.8b, v1.8b, #7
 ; SVE-NEXT:    ptrue p0.b, vl8
 ; SVE-NEXT:    // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; SVE-NEXT:    st1b { z0.b }, p0, [x0]
+; SVE-NEXT:    cmpne p1.b, p0/z, z1.b, #0
+; SVE-NEXT:    st1b { z0.b }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x i8>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load
@@ -148,8 +148,8 @@ define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %m
 ; SVE-NEXT:    ptrue p0.h, vl8
 ; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    shl v1.8h, v1.8h, #15
-; SVE-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT:    st1h { z0.h }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x i16>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load
@@ -171,9 +171,9 @@ define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %m
 ; SVE-NEXT:    shl v3.4s, v3.4s, #31
 ; SVE-NEXT:    shl v2.4s, v2.4s, #31
 ; SVE-NEXT:    cmpne p1.s, p0/z, z3.s, #0
-; SVE-NEXT:    cmpne p0.s, p0/z, z2.s, #0
+; SVE-NEXT:    cmpne p2.s, p0/z, z2.s, #0
 ; SVE-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
-; SVE-NEXT:    st1w { z0.s }, p0, [x0]
+; SVE-NEXT:    st1w { z0.s }, p2, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x i32>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
@@ -211,12 +211,12 @@ define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %m
 ; SVE-NEXT:    shl v4.2d, v7.2d, #63
 ; SVE-NEXT:    cmpne p2.d, p0/z, z5.d, #0
 ; SVE-NEXT:    cmpne p3.d, p0/z, z6.d, #0
-; SVE-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; SVE-NEXT:    cmpne p4.d, p0/z, z4.d, #0
 ; SVE-NEXT:    st1d { z2.d }, p1, [x0, x8, lsl #3]
 ; SVE-NEXT:    mov x8, #2 // =0x2
 ; SVE-NEXT:    st1d { z3.d }, p2, [x0, x9, lsl #3]
 ; SVE-NEXT:    st1d { z1.d }, p3, [x0, x8, lsl #3]
-; SVE-NEXT:    st1d { z0.d }, p0, [x0]
+; SVE-NEXT:    st1d { z0.d }, p4, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x i64>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load
@@ -231,8 +231,8 @@ define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %
 ; SVE-NEXT:    ptrue p0.h, vl8
 ; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; SVE-NEXT:    shl v1.8h, v1.8h, #15
-; SVE-NEXT:    cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT:    st1h { z0.h }, p0, [x0]
+; SVE-NEXT:    cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT:    st1h { z0.h }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x half>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load
@@ -254,9 +254,9 @@ define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1>
 ; SVE-NEXT:    shl v3.4s, v3.4s, #31
 ; SVE-NEXT:    shl v2.4s, v2.4s, #31
 ; SVE-NEXT:    cmpne p1.s, p0/z, z3.s, #0
-; SVE-NEXT:    cmpne p0.s, p0/z, z2.s, #0
+; SVE-NEXT:    cmpne p2.s, p0/z, z2.s, #0
 ; SVE-NEXT:    st1w { z1.s }, p1, [x0, x8, lsl #2]
-; SVE-NEXT:    st1w { z0.s }, p0, [x0]
+; SVE-NEXT:    st1w { z0.s }, p2, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x float>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load
@@ -294,12 +294,12 @@ define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1>
 ; SVE-NEXT:    shl v4.2d, v7.2d, #63
 ; SVE-NEXT:    cmpne p2.d, p0/z, z5.d, #0
 ; SVE-NEXT:    cmpne p3.d, p0/z, z6.d, #0
-; SVE-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; SVE-NEXT:    cmpne p4.d, p0/z, z4.d, #0
 ; SVE-NEXT:    st1d { z2.d }, p1, [x0, x8, lsl #3]
 ; SVE-NEXT:    mov x8, #2 // =0x2
 ; SVE-NEXT:    st1d { z3.d }, p2, [x0, x9, lsl #3]
 ; SVE-NEXT:    st1d { z1.d }, p3, [x0, x8, lsl #3]
-; SVE-NEXT:    st1d { z0.d }, p0, [x0]
+; SVE-NEXT:    st1d { z0.d }, p4, [x0]
 ; SVE-NEXT:    ret
   %load = load <8 x double>, ptr %ptr, align 32
   %sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load
@@ -313,8 +313,8 @@ define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %
 ; SVE-NEXT:    shl v1.16b, v1.16b, #7
 ; SVE-NEXT:    ptrue p0.b, vl16
 ; SVE-NEXT:    // kill: def $q0 killed $q0 def $z0
-; SVE-NEXT:    cmpne p0.b, p0/z, z1.b, #0
-; SVE-NEXT:    st1b { z0.b }, p0, [x0]
+; SVE-NEXT:    cmpne p1.b, p0/z, z1.b, #0
+; SVE-NEXT:    st1b { z0.b }, p1, [x0]
 ; SVE-NEXT:    ret
   %load = load <16 x i8>, ptr %ptr, align 32
   %sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load
@@ -334,9 +334,9 @@ define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1>
 ; SVE-NEXT:    shl v3.8h, v3.8h, #15
 ; SVE-NEXT:    shl v2.8h, v2.8h, #15
 ; SVE-NEXT:    cmpne p1.h, p0/z, z3.h, #0
-; SVE-NEXT:    cmpne p0.h, p0/z, z2.h, #0
+; SVE-NEXT: ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/190139


More information about the llvm-commits mailing list