[llvm] [AArch64] Lower abds and abdu on AArch64 (PR #159085)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 06:08:47 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: AZero13 (AZero13)
<details>
<summary>Changes</summary>
---
Patch is 32.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159085.diff
6 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+112-33)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+16)
- (modified) llvm/test/CodeGen/AArch64/abs.ll (+30)
- (modified) llvm/test/CodeGen/AArch64/alias_mask.ll (+120-128)
- (removed) llvm/test/CodeGen/AArch64/csel-subs-dag-combine.ll (-112)
- (modified) llvm/test/CodeGen/AArch64/pr72777.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d7c90bcb9723d..648710be3254d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -698,6 +698,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ABS, MVT::i64, Custom);
}
+ setOperationAction(ISD::ABDS, MVT::i32, Custom);
+ setOperationAction(ISD::ABDS, MVT::i64, Custom);
+ setOperationAction(ISD::ABDU, MVT::i32, Custom);
+ setOperationAction(ISD::ABDU, MVT::i64, Custom);
+
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -3653,7 +3658,8 @@ static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
- const SDLoc &DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG,
+ bool MIOrPLSupported = false) {
EVT VT = LHS.getValueType();
const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
@@ -3696,6 +3702,33 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
} else if (LHS.getOpcode() == AArch64ISD::ANDS) {
// Use result of ANDS
return LHS.getValue(1);
+ } else if (MIOrPLSupported) {
+ // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+ if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+ // SUB(x, y) < 0 -> SUBS(x, y)
+ return DAG
+ .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
+ // ADD(x, y) >= 0 -> ADDS(x, y)
+ return DAG
+ .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
+ // ADD(x, y) < 0 -> ADDS(x, y)
+ return DAG
+ .getNode(AArch64ISD::ADDS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
+ // SUB(x, y) >= 0 -> SUBS(x, y)
+ return DAG
+ .getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ LHS.getOperand(0), LHS.getOperand(1))
+ .getValue(1);
+ }
}
}
@@ -3760,7 +3793,8 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
- const SDLoc &DL, SelectionDAG &DAG) {
+ const SDLoc &DL, SelectionDAG &DAG,
+ bool MIOrPLSupported = false) {
unsigned Opcode = 0;
const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
@@ -3787,6 +3821,30 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
// we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
Opcode = AArch64ISD::CCMN;
LHS = LHS.getOperand(1);
+ } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC) &&
+ MIOrPLSupported) {
+ // For MIOrPLSupported, optimize SUB/ADD operations with zero comparison
+ if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETLT) {
+ // SUB(x, y) < 0 -> CCMP(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMP;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETGE) {
+ // ADD(x, y) >= 0 -> CCMN(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMN;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::ADD && CC == ISD::SETLT) {
+ // ADD(x, y) < 0 -> CCMP(x, -y) with appropriate condition
+ Opcode = AArch64ISD::CCMN;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ } else if (LHS.getOpcode() == ISD::SUB && CC == ISD::SETGE) {
+ // SUB(x, y) >= 0 -> CCMP(x, y) with appropriate condition
+ Opcode = AArch64ISD::CCMP;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
@@ -3913,7 +3971,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
- DAG);
+ DAG, true);
}
assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
@@ -4192,7 +4250,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
if (!Cmp) {
- Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
+ Cmp = emitComparison(LHS, RHS, CC, DL, DAG, true);
AArch64CC = changeIntCCToAArch64CC(CC, RHS);
}
AArch64cc = getCondCode(DAG, AArch64CC);
@@ -7312,13 +7370,57 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
SDLoc DL(Op);
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(0));
- // Generate SUBS & CSEL.
- SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+
+ // Generate CMP & CSEL.
+ SDValue Cmp = emitComparison(Op.getOperand(0), DAG.getConstant(0, DL, VT),
+ ISD::SETGE, DL, DAG, true);
+ SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
- getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
+ getCondCode(DAG, AArch64CC::PL), Cmp);
+}
+
+// Lower ABDS/ABDU (signed/unsigned absolute difference). Vector types become
+// the predicated ABDS_PRED/ABDU_PRED node. Scalars become abs(sub) when the
+// subtraction is known not to overflow, otherwise SUBS + CSEL(a - b, -(a - b)).
+SDValue AArch64TargetLowering::LowerABD(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  bool IsSigned = Op.getOpcode() == ISD::ABDS;
+
+  // Vector types are handled by the predicated absolute-difference nodes.
+  if (VT.isVector())
+    return LowerToPredicatedOp(
+        Op, DAG, IsSigned ? AArch64ISD::ABDS_PRED : AArch64ISD::ABDU_PRED);
+
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  // If the subtract doesn't overflow then just use abs(sub()).
+  bool IsNonNegative = DAG.SignBitIsZero(LHS) && DAG.SignBitIsZero(RHS);
+
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, LHS, RHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, LHS, RHS));
+
+  if (DAG.willNotOverflowSub(IsSigned || IsNonNegative, RHS, LHS))
+    return DAG.getNode(ISD::ABS, DL, VT,
+                       DAG.getNode(ISD::SUB, DL, VT, RHS, LHS));
+
+  // Compute a - b and produce the flags with a single SUBS.
+  SDValue Cmp =
+      DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS);
+
+  // Compute b - a as the negation of a - b.
+  SDValue Neg = DAG.getNegative(Cmp.getValue(0), DL, VT);
+
+  // Select a - b when a > b (GT for signed, HI for unsigned), otherwise b - a.
+  // When a == b both candidates are zero, so the strict comparison is fine.
+  AArch64CC::CondCode CC = IsSigned ? AArch64CC::GT : AArch64CC::HI;
+
+  return DAG.getNode(AArch64ISD::CSEL, DL, VT, Cmp.getValue(0), Neg,
+                     getCondCode(DAG, CC), Cmp.getValue(1));
}
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
@@ -25857,29 +25959,6 @@ static SDValue performCSELCombine(SDNode *N,
}
}
- // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
- // use overflow flags, to avoid the comparison with zero. In case of success,
- // this also replaces the original SUB(x,y) with the newly created SUBS(x,y).
- // NOTE: Perhaps in the future use performFlagSettingCombine to replace SUB
- // nodes with their SUBS equivalent as is already done for other flag-setting
- // operators, in which case doing the replacement here becomes redundant.
- if (Cond.getOpcode() == AArch64ISD::SUBS && Cond->hasNUsesOfValue(1, 1) &&
- isNullConstant(Cond.getOperand(1))) {
- SDValue Sub = Cond.getOperand(0);
- AArch64CC::CondCode CC =
- static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
- if (Sub.getOpcode() == ISD::SUB &&
- (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
- CC == AArch64CC::PL)) {
- SDLoc DL(N);
- SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
- Sub.getOperand(0), Sub.getOperand(1));
- DCI.CombineTo(Sub.getNode(), Subs);
- DCI.CombineTo(Cond.getNode(), Subs, Subs.getValue(1));
- return SDValue(N, 0);
- }
- }
-
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9a7512b77ecdb..f2468910a39cb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1730,12 +1730,20 @@ static unsigned sForm(MachineInstr &Instr) {
case AArch64::ADDSWrr:
case AArch64::ADDSWri:
+ case AArch64::ADDSWrx:
case AArch64::ADDSXrr:
case AArch64::ADDSXri:
+ case AArch64::ADDSXrx:
case AArch64::SUBSWrr:
case AArch64::SUBSWri:
+ case AArch64::SUBSWrx:
case AArch64::SUBSXrr:
case AArch64::SUBSXri:
+ case AArch64::SUBSXrx:
+ case AArch64::ADCSWr:
+ case AArch64::ADCSXr:
+ case AArch64::SBCSWr:
+ case AArch64::SBCSXr:
return Instr.getOpcode();
case AArch64::ADDWrr:
@@ -1746,6 +1754,10 @@ static unsigned sForm(MachineInstr &Instr) {
return AArch64::ADDSXrr;
case AArch64::ADDXri:
return AArch64::ADDSXri;
+ case AArch64::ADDWrx:
+ return AArch64::ADDSWrx;
+ case AArch64::ADDXrx:
+ return AArch64::ADDSXrx;
case AArch64::ADCWr:
return AArch64::ADCSWr;
case AArch64::ADCXr:
@@ -1758,6 +1770,10 @@ static unsigned sForm(MachineInstr &Instr) {
return AArch64::SUBSXrr;
case AArch64::SUBXri:
return AArch64::SUBSXri;
+ case AArch64::SUBWrx:
+ return AArch64::SUBSWrx;
+ case AArch64::SUBXrx:
+ return AArch64::SUBSXrx;
case AArch64::SBCWr:
return AArch64::SBCSWr;
case AArch64::SBCXr:
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 0f56d25a47b2a..92d8ba242e0de 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -388,3 +388,33 @@ entry:
ret <3 x i32> %res
}
declare <3 x i32> @llvm.abs.v3i32(<3 x i32>, i1)
+
+define i32 @combine_subs_multiple_sub_uses(i32 %a, i32 %b) {
+; CHECK-LABEL: combine_subs_multiple_sub_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: subs w8, w0, w1
+; CHECK-NEXT: csel w9, w0, w1, ne
+; CHECK-NEXT: add w0, w9, w8
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cc = icmp ne i32 %sub, 0
+ %sel = select i1 %cc, i32 %a, i32 %b
+ %add = add i32 %sel, %sub
+ ret i32 %add
+}
+
+define i32 @do_not_combine_subs_multiple_flag_uses(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: do_not_combine_subs_multiple_flag_uses:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, w1
+; CHECK-NEXT: csel w8, w0, w1, ne
+; CHECK-NEXT: csel w9, w2, w3, ne
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cc = icmp ne i32 %sub, 0
+ %sel = select i1 %cc, i32 %a, i32 %b
+ %other = select i1 %cc, i32 %c, i32 %d
+ %add = add i32 %sel, %other
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll
index 9b9c020016bab..48c66ad5bac1c 100644
--- a/llvm/test/CodeGen/AArch64/alias_mask.ll
+++ b/llvm/test/CodeGen/AArch64/alias_mask.ll
@@ -393,70 +393,70 @@ entry:
define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_32_expand3:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: subs x9, x1, x0
+; CHECK-NEXT: subs x10, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: add x10, x9, #3
-; CHECK-NEXT: sub x11, x9, #61
-; CHECK-NEXT: csel x10, x10, x9, mi
-; CHECK-NEXT: subs x9, x9, #64
-; CHECK-NEXT: csel x9, x11, x9, mi
-; CHECK-NEXT: asr x10, x10, #2
-; CHECK-NEXT: asr x9, x9, #2
-; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add x9, x10, #3
+; CHECK-NEXT: sub x12, x10, #61
+; CHECK-NEXT: csel x9, x9, x10, mi
+; CHECK-NEXT: asr x11, x9, #2
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: dup v1.2d, x11
; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: subs x10, x10, #64
; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x10
-; CHECK-NEXT: dup v16.2d, x9
-; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT: csel x10, x12, x10, mi
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT: asr x10, x10, #2
+; CHECK-NEXT: add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT: cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT: cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT: cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT: cmp x10, #1
+; CHECK-NEXT: cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT: cset w10, lt
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d
-; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d
-; CHECK-NEXT: cset w9, lt
-; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d
-; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s
-; CHECK-NEXT: cmp x10, #1
-; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s
-; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s
-; CHECK-NEXT: cset w10, lt
-; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s
-; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s
-; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h
-; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h
+; CHECK-NEXT: cmhi v23.2d, v16.2d, v6.2d
+; CHECK-NEXT: cmhi v24.2d, v16.2d, v7.2d
+; CHECK-NEXT: cmhi v6.2d, v1.2d, v6.2d
+; CHECK-NEXT: cmhi v16.2d, v16.2d, v0.2d
+; CHECK-NEXT: cmhi v7.2d, v1.2d, v7.2d
+; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d
+; CHECK-NEXT: uzp1 v1.4s, v21.4s, v20.4s
+; CHECK-NEXT: uzp1 v3.4s, v4.4s, v3.4s
+; CHECK-NEXT: uzp1 v4.4s, v23.4s, v5.4s
+; CHECK-NEXT: uzp1 v5.4s, v18.4s, v24.4s
+; CHECK-NEXT: uzp1 v6.4s, v6.4s, v22.4s
+; CHECK-NEXT: uzp1 v2.4s, v2.4s, v16.4s
+; CHECK-NEXT: uzp1 v7.4s, v17.4s, v7.4s
+; CHECK-NEXT: uzp1 v0.4s, v19.4s, v0.4s
; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h
-; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h
+; CHECK-NEXT: uzp1 v2.8h, v3.8h, v2.8h
+; CHECK-NEXT: uzp1 v3.8h, v7.8h, v6.8h
+; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT: uzp1 v1.16b, v4.16b, v2.16b
+; CHECK-NEXT: uzp1 v0.16b, v3.16b, v0.16b
; CHECK-NEXT: dup v2.16b, w10
-; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b
; CHECK-NEXT: dup v3.16b, w9
; CHECK-NEXT: adrp x9, .LCPI14_0
-; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b
; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0]
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
@@ -469,8 +469,8 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: str h1, [x8]
-; CHECK-NEXT: str h0, [x8, #2]
+; CHECK-NEXT: str h1, [x8, #2]
+; CHECK-NEXT: str h0, [x8]
; CHECK-NEXT: ret
entry:
%0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4)
@@ -586,70 +586,70 @@ entry:
define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
; CHECK-LABEL: whilewr_64_expand4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: subs x9, x1, x0
+; CHECK-NEXT: subs x10, x1, x0
; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: add x10, x9, #7
-; CHECK-NEXT: sub x11, x9, #121
-; CHECK-NEXT: csel x10, x10, x9, mi
-; CHECK-NEXT: subs x9, x9, #128
-; CHECK-NEXT: csel x9, x11, x9, mi
-; CHECK-NEXT: asr x10, x10, #3
-; CHECK-NEXT: asr x9, x9, #3
-; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: add x9, x10, #7
+; CHECK-NEXT: sub x12, x10, #121
+; CHECK-NEXT: csel x9, x9, x10, mi
+; CHECK-NEXT: asr x11, x9, #3
; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z4.d, z0.d
+; CHECK-NEXT: cmp x11, #1
+; CHECK-NEXT: dup v1.2d, x11
; CHECK-NEXT: mov z5.d, z0.d
+; CHECK-NEXT: cset w9, lt
+; CHECK-NEXT: subs x10, x10, #128
; CHECK-NEXT: mov z6.d, z0.d
-; CHECK-NEXT: dup v7.2d, x10
-; CHECK-NEXT: dup v16.2d, x9
-; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc
-; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa
-; CHECK-NEXT: cmp x9, #1
-; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8
-; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6
-; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4
-; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2
-; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d
+; CHECK-NEXT: csel x10, x12, x10, mi
+; CHECK-NEXT: mov z7.d, z0.d
+; CHECK-NEXT: add z2.d, z2.d, #12 // =0xc
+; CHECK-NEXT: asr x10, x10, #3
+; CHECK-NEXT: add z3.d, z3.d, #10 // =0xa
+; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8
+; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6
+; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4
+; CHECK-NEXT: cmhi v17.2d, v1.2d, v0.2d
+; CHECK-NEXT: dup v16.2d, x10
+; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2
+; CHECK-NEXT: cmhi v19.2d, v1.2d, v2.2d
+; CHECK-NEXT: cmhi v20.2d, v1.2d, v3.2d
+; CHECK-NEXT: cmhi v21.2d, v1.2d, v4.2d
+; CHECK-NEXT: cmp x10, #1
+; CHECK-NEXT: cmhi v22.2d, v1.2d, v5.2d
+; CHECK-NEXT: cset w10, lt
; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d
; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe
-; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d
-; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d
-; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d
-; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d
-; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d
-; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d
; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d
; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d
; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d
-; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d
; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/159085
More information about the llvm-commits
mailing list