[llvm] [AArch64] Hint regalloc to choose distinct predicate for MATCH/CMP (PR #190139)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 02:37:20 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Sander de Smalen (sdesmalen-arm)
<details>
<summary>Changes</summary>
For some cores it is preferable to choose a destination register that does not match the governing predicate for the CMP<> and MATCH instructions.
The hint is conservative in that it tries not to pick a callee-save register if it's not already used/allocated for other purposes, as that would introduce new spills/fills. Note that this might be preferable if e.g. the CMP/MATCH instruction is executed in a loop, but it might also be less preferable for small functions that have an SVE interface (p4-p15 are callee-saved and would need to be preserved).
It is enabled for all cores by default, but it can be disabled by adding the `disable-distinct-dst-reg-cmp-match` feature. This feature can also be added to specific cores if this behaviour is undesirable.
---
Patch is 414.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190139.diff
66 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64Features.td (+10)
- (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+159-3)
- (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+10-10)
- (modified) llvm/test/CodeGen/AArch64/combine-storetomstore.ll (+56-56)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll (+9-9)
- (modified) llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/extract-vector-elt-sve.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/intrinsic-vector-match-sve2.ll (+14-14)
- (modified) llvm/test/CodeGen/AArch64/rcpc3-sve.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-cmp-select.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-fcvt.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll (+33-33)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll (+30-30)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll (+44-44)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll (+40-40)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll (+11-11)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll (+50-50)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll (+58-58)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll (+64-64)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll (+51-51)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-insert-element.ll (+32-32)
- (modified) llvm/test/CodeGen/AArch64/sve-intrinsics-int-compares.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve-load-compare-store.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll (+23-23)
- (added) llvm/test/CodeGen/AArch64/sve-match-cmp-predicate.ll (+205)
- (modified) llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop.ll (+114-114)
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll (+194-194)
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll (+130-130)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll (+36-36)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-ptest-removal-match.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-punpklo-combine.ll (+24-24)
- (added) llvm/test/CodeGen/AArch64/sve-regalloc-hint-match-cmp.mir (+64)
- (modified) llvm/test/CodeGen/AArch64/sve-scmp.ll (+14-14)
- (modified) llvm/test/CodeGen/AArch64/sve-select.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-setcc.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-smulo-sdnode.ll (+28-28)
- (modified) llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll (+6-6)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+21-21)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll (+42-42)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+42-42)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+32-32)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+28-28)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+2-2)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll (+30-30)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll (+24-24)
- (modified) llvm/test/CodeGen/AArch64/sve-trunc.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-ucmp.ll (+14-14)
- (modified) llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll (+28-28)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+14-14)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 4fbad6643d77a..7447fe7afbc68 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -911,6 +911,16 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl",
"HasDisableFastIncVL", "true",
"Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">;
+// For some cores it is preferable to choose a destination register that does
+// not match the governing predicate for the CMP<> and MATCH instructions. When
+// enabled the register allocator tries to use a distinct register. While the
+// algorithm tries to avoid introducing callee-save spill/fills and therefore
+// shouldn't have any adverse effects on cores where using distinct registers
+// is not necessarily preferable, we still provide a way to turn the feature off.
+def FeatureDisableDistinctDstRegCmpMatch : SubtargetFeature<
+ "disable-distinct-dst-reg-cmp-match", "UseDistinctDstRegCmpMatch",
+  "false", "Do not use a distinct destination register for CMP/MATCH instructions">;
+
// On most processors we want to avoid moving from WZR to vector registers
// (relying on materializing 0 to a FPR and moving from there instead),
// but on some (in-order) cores it's preferable to avoid the extra instruction instead.
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 8c0dd4381fae8..0518bfb0cd44f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -1114,6 +1114,159 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
}
+static bool requiresMatchCmpRegallocHint(unsigned Opc) {
+ switch (Opc) {
+ default:
+ return false;
+ // All match instructions
+ case AArch64::MATCH_PPzZZ_B:
+ case AArch64::MATCH_PPzZZ_H:
+ // Vector compare instructions (CMPL* are aliases of CMPG*/H*)
+ case AArch64::CMPEQ_PPzZZ_B:
+ case AArch64::CMPEQ_PPzZZ_H:
+ case AArch64::CMPEQ_PPzZZ_S:
+ case AArch64::CMPEQ_PPzZZ_D:
+ case AArch64::CMPNE_PPzZZ_B:
+ case AArch64::CMPNE_PPzZZ_H:
+ case AArch64::CMPNE_PPzZZ_S:
+ case AArch64::CMPNE_PPzZZ_D:
+ case AArch64::CMPGE_PPzZZ_B:
+ case AArch64::CMPGE_PPzZZ_H:
+ case AArch64::CMPGE_PPzZZ_S:
+ case AArch64::CMPGE_PPzZZ_D:
+ case AArch64::CMPHS_PPzZZ_B:
+ case AArch64::CMPHS_PPzZZ_H:
+ case AArch64::CMPHS_PPzZZ_S:
+ case AArch64::CMPHS_PPzZZ_D:
+ case AArch64::CMPGT_PPzZZ_B:
+ case AArch64::CMPGT_PPzZZ_H:
+ case AArch64::CMPGT_PPzZZ_S:
+ case AArch64::CMPGT_PPzZZ_D:
+ case AArch64::CMPHI_PPzZZ_B:
+ case AArch64::CMPHI_PPzZZ_H:
+ case AArch64::CMPHI_PPzZZ_S:
+ case AArch64::CMPHI_PPzZZ_D:
+ // Vector/immediate compare instructions
+ case AArch64::CMPEQ_PPzZI_B:
+ case AArch64::CMPEQ_PPzZI_H:
+ case AArch64::CMPEQ_PPzZI_S:
+ case AArch64::CMPEQ_PPzZI_D:
+ case AArch64::CMPNE_PPzZI_B:
+ case AArch64::CMPNE_PPzZI_H:
+ case AArch64::CMPNE_PPzZI_S:
+ case AArch64::CMPNE_PPzZI_D:
+ case AArch64::CMPGE_PPzZI_B:
+ case AArch64::CMPGE_PPzZI_H:
+ case AArch64::CMPGE_PPzZI_S:
+ case AArch64::CMPGE_PPzZI_D:
+ case AArch64::CMPHS_PPzZI_B:
+ case AArch64::CMPHS_PPzZI_H:
+ case AArch64::CMPHS_PPzZI_S:
+ case AArch64::CMPHS_PPzZI_D:
+ case AArch64::CMPGT_PPzZI_B:
+ case AArch64::CMPGT_PPzZI_H:
+ case AArch64::CMPGT_PPzZI_S:
+ case AArch64::CMPGT_PPzZI_D:
+ case AArch64::CMPHI_PPzZI_B:
+ case AArch64::CMPHI_PPzZI_H:
+ case AArch64::CMPHI_PPzZI_S:
+ case AArch64::CMPHI_PPzZI_D:
+ case AArch64::CMPLE_PPzZI_B:
+ case AArch64::CMPLE_PPzZI_H:
+ case AArch64::CMPLE_PPzZI_S:
+ case AArch64::CMPLE_PPzZI_D:
+ case AArch64::CMPLS_PPzZI_B:
+ case AArch64::CMPLS_PPzZI_H:
+ case AArch64::CMPLS_PPzZI_S:
+ case AArch64::CMPLS_PPzZI_D:
+ case AArch64::CMPLT_PPzZI_B:
+ case AArch64::CMPLT_PPzZI_H:
+ case AArch64::CMPLT_PPzZI_S:
+ case AArch64::CMPLT_PPzZI_D:
+ case AArch64::CMPLO_PPzZI_B:
+ case AArch64::CMPLO_PPzZI_H:
+ case AArch64::CMPLO_PPzZI_S:
+ case AArch64::CMPLO_PPzZI_D:
+ // Wide-vector compare instructions
+ case AArch64::CMPEQ_WIDE_PPzZZ_B:
+ case AArch64::CMPEQ_WIDE_PPzZZ_H:
+ case AArch64::CMPEQ_WIDE_PPzZZ_S:
+ case AArch64::CMPNE_WIDE_PPzZZ_B:
+ case AArch64::CMPNE_WIDE_PPzZZ_H:
+ case AArch64::CMPNE_WIDE_PPzZZ_S:
+ case AArch64::CMPGE_WIDE_PPzZZ_B:
+ case AArch64::CMPGE_WIDE_PPzZZ_H:
+ case AArch64::CMPGE_WIDE_PPzZZ_S:
+ case AArch64::CMPHS_WIDE_PPzZZ_B:
+ case AArch64::CMPHS_WIDE_PPzZZ_H:
+ case AArch64::CMPHS_WIDE_PPzZZ_S:
+ case AArch64::CMPGT_WIDE_PPzZZ_B:
+ case AArch64::CMPGT_WIDE_PPzZZ_H:
+ case AArch64::CMPGT_WIDE_PPzZZ_S:
+ case AArch64::CMPHI_WIDE_PPzZZ_B:
+ case AArch64::CMPHI_WIDE_PPzZZ_H:
+ case AArch64::CMPHI_WIDE_PPzZZ_S:
+ case AArch64::CMPLE_WIDE_PPzZZ_B:
+ case AArch64::CMPLE_WIDE_PPzZZ_H:
+ case AArch64::CMPLE_WIDE_PPzZZ_S:
+ case AArch64::CMPLS_WIDE_PPzZZ_B:
+ case AArch64::CMPLS_WIDE_PPzZZ_H:
+ case AArch64::CMPLS_WIDE_PPzZZ_S:
+ case AArch64::CMPLT_WIDE_PPzZZ_B:
+ case AArch64::CMPLT_WIDE_PPzZZ_H:
+ case AArch64::CMPLT_WIDE_PPzZZ_S:
+ case AArch64::CMPLO_WIDE_PPzZZ_B:
+ case AArch64::CMPLO_WIDE_PPzZZ_H:
+ case AArch64::CMPLO_WIDE_PPzZZ_S:
+ return true;
+ }
+}
+
+static bool HandleMatchCmpPredicateHint(Register VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const VirtRegMap *VRM,
+ const MachineRegisterInfo &MRI,
+ const AArch64Subtarget &ST,
+ const LiveRegMatrix *Matrix) {
+ const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
+ if (!ST.useDistinctDstRegCmpMatch() ||
+ !AArch64::PPRRegClass.hasSubClassEq(RegRC) || !MRI.hasOneDef(VirtReg) ||
+ Order.size() < 2)
+ return false;
+
+ const MachineInstr *DefInst = MRI.getOneDef(VirtReg)->getParent();
+ if (!requiresMatchCmpRegallocHint(DefInst->getOpcode()))
+ return false;
+
+ Register Op1Reg = DefInst->getOperand(1).getReg();
+ if (Op1Reg.isVirtual())
+ Op1Reg = VRM->getPhys(Op1Reg);
+
+ // If no register is allocated for the general-predicate, it's not yet
+ // possible to choose a distinct register.
+ if (!Op1Reg.isValid())
+ return false;
+
+ // Move Op1Reg as the least preferred register.
+ //
+ // This might result in callee-save spills when the function takes/returns
+ // arguments in SVE registers (i.e. needs to preserve p4-p15) and can't reuse
+ // p0-p3. That's why we limit it to non-callee saved registers or to
+ // callee-saved registers that have already been allocated for other uses in
+ // the function.
+ DenseSet<unsigned> CSRs;
+ for (unsigned I = 0; MRI.getCalleeSavedRegs()[I]; ++I)
+ CSRs.insert(MRI.getCalleeSavedRegs()[I]);
+
+ Hints.append(Order.begin(), Order.end());
+ llvm::stable_sort(Hints, [&](Register A, Register B) {
+ return B == Op1Reg &&
+ (!CSRs.contains(A) || !MRI.def_empty(A) || Matrix->isPhysRegUsed(A));
+ });
+ return true;
+}
+
// We add regalloc hints for different cases:
// * Choosing a better destination operand for predicated SVE instructions
// where the inactive lanes are undef, by choosing a register that is not
@@ -1143,14 +1296,14 @@ bool AArch64RegisterInfo::getRegAllocationHints(
MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool ConsiderOnlyHints =
+ TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+
// For predicated SVE instructions where the inactive lanes are undef,
// pick a destination register that is not unique to avoid introducing
// a movprfx.
const TargetRegisterClass *RegRC = MRI.getRegClass(VirtReg);
if (AArch64::ZPRRegClass.hasSubClassEq(RegRC)) {
- bool ConsiderOnlyHints = TargetRegisterInfo::getRegAllocationHints(
- VirtReg, Order, Hints, MF, VRM);
-
for (const MachineOperand &DefOp : MRI.def_operands(VirtReg)) {
const MachineInstr &Def = *DefOp.getParent();
if (DefOp.isImplicit() ||
@@ -1200,6 +1353,9 @@ bool AArch64RegisterInfo::getRegAllocationHints(
return ConsiderOnlyHints;
}
+ if (HandleMatchCmpPredicateHint(VirtReg, Order, Hints, VRM, MRI, ST, Matrix))
+ return ConsiderOnlyHints;
+
if (!ST.hasSME() || !ST.isStreaming())
return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
VRM);
diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
index 05d083a654cf6..778be79038a78 100644
--- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll
+++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll
@@ -183,8 +183,8 @@ define <vscale x 1 x i1> @lane_mask_nxv1i1_i32(i32 %index, i32 %TC) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: uqadd z0.s, z0.s, z1.s
; CHECK-NEXT: mov z1.s, w1
-; CHECK-NEXT: cmphi p0.s, p0/z, z1.s, z0.s
-; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: cmphi p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT: punpklo p0.h, p1.b
; CHECK-NEXT: punpklo p0.h, p0.b
; CHECK-NEXT: ret
%active.lane.mask = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 %index, i32 %TC)
@@ -303,8 +303,8 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) {
; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d
; CHECK-STREAMING-NEXT: mov z1.b, w1
-; CHECK-STREAMING-NEXT: cmphi p0.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b
+; CHECK-STREAMING-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: ret
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i8(i8 %index, i8 %TC)
ret <16 x i1> %active.lane.mask
@@ -329,8 +329,8 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) {
; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d
; CHECK-STREAMING-NEXT: mov z1.b, w1
-; CHECK-STREAMING-NEXT: cmphi p0.b, p0/z, z1.b, z0.b
-; CHECK-STREAMING-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b
+; CHECK-STREAMING-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: ret
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i8(i8 %index, i8 %TC)
ret <8 x i1> %active.lane.mask
@@ -360,8 +360,8 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) {
; CHECK-STREAMING-NEXT: mov z1.h, w1
; CHECK-STREAMING-NEXT: umin z0.h, z0.h, #255
; CHECK-STREAMING-NEXT: and z1.h, z1.h, #0xff
-; CHECK-STREAMING-NEXT: cmphi p0.h, p0/z, z1.h, z0.h
-; CHECK-STREAMING-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: cmphi p1.h, p0/z, z1.h, z0.h
+; CHECK-STREAMING-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: ret
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC)
ret <4 x i1> %active.lane.mask
@@ -389,8 +389,8 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) {
; CHECK-STREAMING-NEXT: and w8, w1, #0xff
; CHECK-STREAMING-NEXT: mov z1.s, w8
; CHECK-STREAMING-NEXT: umin z0.s, z0.s, #255
-; CHECK-STREAMING-NEXT: cmphi p0.s, p0/z, z1.s, z0.s
-; CHECK-STREAMING-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-STREAMING-NEXT: cmphi p1.s, p0/z, z1.s, z0.s
+; CHECK-STREAMING-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; CHECK-STREAMING-NEXT: ret
%active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC)
ret <2 x i1> %active.lane.mask
diff --git a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
index 1e4a695d1f4e8..d4244fedf7504 100644
--- a/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
+++ b/llvm/test/CodeGen/AArch64/combine-storetomstore.ll
@@ -24,8 +24,8 @@ define void @test_masked_store_success_v4i16(<4 x i16> %x, ptr %ptr, <4 x i1> %m
; SVE-NEXT: shl v1.4h, v1.4h, #15
; SVE-NEXT: ptrue p0.h, vl4
; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT: st1h { z0.h }, p0, [x0]
+; SVE-NEXT: cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT: st1h { z0.h }, p1, [x0]
; SVE-NEXT: ret
%load = load <4 x i16>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x i16> %x, <4 x i16> %load
@@ -40,8 +40,8 @@ define void @test_masked_store_success_v4i32(<4 x i32> %x, ptr %ptr, <4 x i1> %m
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: shl v1.4s, v1.4s, #31
-; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0
-; SVE-NEXT: st1w { z0.s }, p0, [x0]
+; SVE-NEXT: cmpne p1.s, p0/z, z1.s, #0
+; SVE-NEXT: st1w { z0.s }, p1, [x0]
; SVE-NEXT: ret
%load = load <4 x i32>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %load
@@ -62,9 +62,9 @@ define void @test_masked_store_success_v4i64(<4 x i64> %x, ptr %ptr, <4 x i1> %m
; SVE-NEXT: shl v3.2d, v3.2d, #63
; SVE-NEXT: shl v2.2d, v2.2d, #63
; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0
-; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; SVE-NEXT: cmpne p2.d, p0/z, z2.d, #0
; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3]
-; SVE-NEXT: st1d { z0.d }, p0, [x0]
+; SVE-NEXT: st1d { z0.d }, p2, [x0]
; SVE-NEXT: ret
%load = load <4 x i64>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %load
@@ -78,8 +78,8 @@ define void @test_masked_store_success_v4f16(<4 x half> %x, ptr %ptr, <4 x i1> %
; SVE-NEXT: shl v1.4h, v1.4h, #15
; SVE-NEXT: ptrue p0.h, vl4
; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT: st1h { z0.h }, p0, [x0]
+; SVE-NEXT: cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT: st1h { z0.h }, p1, [x0]
; SVE-NEXT: ret
%load = load <4 x half>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x half> %x, <4 x half> %load
@@ -94,8 +94,8 @@ define void @test_masked_store_success_v4f32(<4 x float> %x, ptr %ptr, <4 x i1>
; SVE-NEXT: ptrue p0.s, vl4
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: shl v1.4s, v1.4s, #31
-; SVE-NEXT: cmpne p0.s, p0/z, z1.s, #0
-; SVE-NEXT: st1w { z0.s }, p0, [x0]
+; SVE-NEXT: cmpne p1.s, p0/z, z1.s, #0
+; SVE-NEXT: st1w { z0.s }, p1, [x0]
; SVE-NEXT: ret
%load = load <4 x float>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x float> %x, <4 x float> %load
@@ -116,9 +116,9 @@ define void @test_masked_store_success_v4f64(<4 x double> %x, ptr %ptr, <4 x i1>
; SVE-NEXT: shl v3.2d, v3.2d, #63
; SVE-NEXT: shl v2.2d, v2.2d, #63
; SVE-NEXT: cmpne p1.d, p0/z, z3.d, #0
-; SVE-NEXT: cmpne p0.d, p0/z, z2.d, #0
+; SVE-NEXT: cmpne p2.d, p0/z, z2.d, #0
; SVE-NEXT: st1d { z1.d }, p1, [x0, x8, lsl #3]
-; SVE-NEXT: st1d { z0.d }, p0, [x0]
+; SVE-NEXT: st1d { z0.d }, p2, [x0]
; SVE-NEXT: ret
%load = load <4 x double>, ptr %ptr, align 32
%sel = select <4 x i1> %mask, <4 x double> %x, <4 x double> %load
@@ -132,8 +132,8 @@ define void @test_masked_store_success_v8i8(<8 x i8> %x, ptr %ptr, <8 x i1> %mas
; SVE-NEXT: shl v1.8b, v1.8b, #7
; SVE-NEXT: ptrue p0.b, vl8
; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
-; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0
-; SVE-NEXT: st1b { z0.b }, p0, [x0]
+; SVE-NEXT: cmpne p1.b, p0/z, z1.b, #0
+; SVE-NEXT: st1b { z0.b }, p1, [x0]
; SVE-NEXT: ret
%load = load <8 x i8>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x i8> %x, <8 x i8> %load
@@ -148,8 +148,8 @@ define void @test_masked_store_success_v8i16(<8 x i16> %x, ptr %ptr, <8 x i1> %m
; SVE-NEXT: ptrue p0.h, vl8
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: shl v1.8h, v1.8h, #15
-; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT: st1h { z0.h }, p0, [x0]
+; SVE-NEXT: cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT: st1h { z0.h }, p1, [x0]
; SVE-NEXT: ret
%load = load <8 x i16>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %load
@@ -171,9 +171,9 @@ define void @test_masked_store_success_v8i32(<8 x i32> %x, ptr %ptr, <8 x i1> %m
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0
-; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0
+; SVE-NEXT: cmpne p2.s, p0/z, z2.s, #0
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
-; SVE-NEXT: st1w { z0.s }, p0, [x0]
+; SVE-NEXT: st1w { z0.s }, p2, [x0]
; SVE-NEXT: ret
%load = load <8 x i32>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %load
@@ -211,12 +211,12 @@ define void @test_masked_store_success_v8i64(<8 x i64> %x, ptr %ptr, <8 x i1> %m
; SVE-NEXT: shl v4.2d, v7.2d, #63
; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0
; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0
-; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0
+; SVE-NEXT: cmpne p4.d, p0/z, z4.d, #0
; SVE-NEXT: st1d { z2.d }, p1, [x0, x8, lsl #3]
; SVE-NEXT: mov x8, #2 // =0x2
; SVE-NEXT: st1d { z3.d }, p2, [x0, x9, lsl #3]
; SVE-NEXT: st1d { z1.d }, p3, [x0, x8, lsl #3]
-; SVE-NEXT: st1d { z0.d }, p0, [x0]
+; SVE-NEXT: st1d { z0.d }, p4, [x0]
; SVE-NEXT: ret
%load = load <8 x i64>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %load
@@ -231,8 +231,8 @@ define void @test_masked_store_success_v8f16(<8 x half> %x, ptr %ptr, <8 x i1> %
; SVE-NEXT: ptrue p0.h, vl8
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
; SVE-NEXT: shl v1.8h, v1.8h, #15
-; SVE-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; SVE-NEXT: st1h { z0.h }, p0, [x0]
+; SVE-NEXT: cmpne p1.h, p0/z, z1.h, #0
+; SVE-NEXT: st1h { z0.h }, p1, [x0]
; SVE-NEXT: ret
%load = load <8 x half>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x half> %x, <8 x half> %load
@@ -254,9 +254,9 @@ define void @test_masked_store_success_v8f32(<8 x float> %x, ptr %ptr, <8 x i1>
; SVE-NEXT: shl v3.4s, v3.4s, #31
; SVE-NEXT: shl v2.4s, v2.4s, #31
; SVE-NEXT: cmpne p1.s, p0/z, z3.s, #0
-; SVE-NEXT: cmpne p0.s, p0/z, z2.s, #0
+; SVE-NEXT: cmpne p2.s, p0/z, z2.s, #0
; SVE-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
-; SVE-NEXT: st1w { z0.s }, p0, [x0]
+; SVE-NEXT: st1w { z0.s }, p2, [x0]
; SVE-NEXT: ret
%load = load <8 x float>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x float> %x, <8 x float> %load
@@ -294,12 +294,12 @@ define void @test_masked_store_success_v8f64(<8 x double> %x, ptr %ptr, <8 x i1>
; SVE-NEXT: shl v4.2d, v7.2d, #63
; SVE-NEXT: cmpne p2.d, p0/z, z5.d, #0
; SVE-NEXT: cmpne p3.d, p0/z, z6.d, #0
-; SVE-NEXT: cmpne p0.d, p0/z, z4.d, #0
+; SVE-NEXT: cmpne p4.d, p0/z, z4.d, #0
; SVE-NEXT: st1d { z2.d }, p1, [x0, x8, lsl #3]
; SVE-NEXT: mov x8, #2 // =0x2
; SVE-NEXT: st1d { z3.d }, p2, [x0, x9, lsl #3]
; SVE-NEXT: st1d { z1.d }, p3, [x0, x8, lsl #3]
-; SVE-NEXT: st1d { z0.d }, p0, [x0]
+; SVE-NEXT: st1d { z0.d }, p4, [x0]
; SVE-NEXT: ret
%load = load <8 x double>, ptr %ptr, align 32
%sel = select <8 x i1> %mask, <8 x double> %x, <8 x double> %load
@@ -313,8 +313,8 @@ define void @test_masked_store_success_v16i8(<16 x i8> %x, ptr %ptr, <16 x i1> %
; SVE-NEXT: shl v1.16b, v1.16b, #7
; SVE-NEXT: ptrue p0.b, vl16
; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; SVE-NEXT: cmpne p0.b, p0/z, z1.b, #0
-; SVE-NEXT: st1b { z0.b }, p0, [x0]
+; SVE-NEXT: cmpne p1.b, p0/z, z1.b, #0
+; SVE-NEXT: st1b { z0.b }, p1, [x0]
; SVE-NEXT: ret
%load = load <16 x i8>, ptr %ptr, align 32
%sel = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %load
@@ -334,9 +334,9 @@ define void @test_masked_store_success_v16i16(<16 x i16> %x, ptr %ptr, <16 x i1>
; SVE-NEXT: shl v3.8h, v3.8h, #15
; SVE-NEXT: shl v2.8h, v2.8h, #15
; SVE-NEXT: cmpne p1.h, p0/z, z3.h, #0
-; SVE-NEXT: cmpne p0.h, p0/z, z2.h, #0
+; SVE-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/190139
More information about the llvm-commits
mailing list