[llvm] [SelectionDAG] Use Magic Algorithm for Splitting UDIV/UREM by Constant (PR #154968)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 08:22:52 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Marius Kamp (mskamp)
<details>
<summary>Changes</summary>
For integer types twice as large as a legal type, we have previously
generated a library call if another splitting technique was not
applicable.
With this change, we use an adaptation of the Magic algorithm. This
algorithm is also used for UDIV/UREM by constants on legal types. The
implementation introduced here is a simple port of the already existing
implementation to types twice the size of a legal type. The core idea of
this algorithm is to replace (udiv x c) for a constant c with the bits
at position s and above of the product of x and (2^s + o)/c
for suitable values of s and o. More details are available in Henry S. Warren, Jr.:
"Hacker's Delight", chapter 10.
An efficient handling of UDIV/UREM by constants on types twice as large
as a legal type is mostly relevant for 32-bit platforms. But some
projects may also benefit on 64-bit platforms. For example, the `fmt`
library for C++ uses 128-bit unsigned divisions by 100 and 10000, which
have not been covered by the previously existing optimizations.
Closes #137514.
---
Patch is 138.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154968.diff
15 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+159-33)
- (modified) llvm/test/CodeGen/AArch64/rem-by-const.ll (+210-70)
- (modified) llvm/test/CodeGen/ARM/funnel-shift.ll (+130-110)
- (modified) llvm/test/CodeGen/Mips/funnel-shift.ll (+188-185)
- (modified) llvm/test/CodeGen/PowerPC/funnel-shift.ll (+134-178)
- (modified) llvm/test/CodeGen/PowerPC/urem-lkk.ll (+82-18)
- (modified) llvm/test/CodeGen/RISCV/div-by-constant.ll (+42-7)
- (modified) llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll (+144-28)
- (modified) llvm/test/CodeGen/RISCV/split-urem-by-constant.ll (+192-28)
- (modified) llvm/test/CodeGen/RISCV/urem-lkk.ll (+49-13)
- (modified) llvm/test/CodeGen/RISCV/urem-vector-lkk.ll (+158-46)
- (modified) llvm/test/CodeGen/X86/divide-by-constant.ll (+458-32)
- (modified) llvm/test/CodeGen/X86/divmod128.ll (+625-15)
- (modified) llvm/test/CodeGen/X86/funnel-shift.ll (+73-39)
- (modified) llvm/test/CodeGen/X86/i128-udiv.ll (+37-9)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 402a012e8e555..9e1a51e291ddb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8011,25 +8011,12 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
// dividend and multiply by the multiplicative inverse of the shifted divisor.
// If we want the remainder, we shift the value left by the number of trailing
// zeros and add the bits that were shifted out of the dividend.
-bool TargetLowering::expandDIVREMByConstant(SDNode *N,
- SmallVectorImpl<SDValue> &Result,
- EVT HiLoVT, SelectionDAG &DAG,
- SDValue LL, SDValue LH) const {
+static bool expandUDIVREMByConstantViaUREMDecomposition(
+ SDNode *N, APInt Divisor, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
+ SelectionDAG &DAG, SDValue LL, SDValue LH, const TargetLowering &TLI) {
unsigned Opcode = N->getOpcode();
EVT VT = N->getValueType(0);
- // TODO: Support signed division/remainder.
- if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
- return false;
- assert(
- (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
- "Unexpected opcode");
-
- auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!CN)
- return false;
-
- APInt Divisor = CN->getAPIntValue();
unsigned BitWidth = Divisor.getBitWidth();
unsigned HBitWidth = BitWidth / 2;
assert(VT.getScalarSizeInBits() == BitWidth &&
@@ -8040,20 +8027,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (Divisor.uge(HalfMaxPlus1))
return false;
- // We depend on the UREM by constant optimization in DAGCombiner that requires
- // high multiply.
- if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
- !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
- return false;
-
- // Don't expand if optimizing for size.
- if (DAG.shouldOptForSize())
- return false;
-
- // Early out for 0 or 1 divisors.
- if (Divisor.ule(1))
- return false;
-
// If the divisor is even, shift it until it becomes odd.
unsigned TrailingZeros = 0;
if (!Divisor[0]) {
@@ -8097,8 +8070,8 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// Use uaddo_carry if we can, otherwise use a compare to detect overflow.
EVT SetCCType =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
- if (isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) {
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), HiLoVT);
+ if (TLI.isOperationLegalOrCustom(ISD::UADDO_CARRY, HiLoVT)) {
SDVTList VTList = DAG.getVTList(HiLoVT, SetCCType);
Sum = DAG.getNode(ISD::UADDO, dl, VTList, LL, LH);
Sum = DAG.getNode(ISD::UADDO_CARRY, dl, VTList, Sum,
@@ -8108,7 +8081,7 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
SDValue Carry = DAG.getSetCC(dl, SetCCType, Sum, LL, ISD::SETULT);
// If the boolean for the target is 0 or 1, we can add the setcc result
// directly.
- if (getBooleanContents(HiLoVT) ==
+ if (TLI.getBooleanContents(HiLoVT) ==
TargetLoweringBase::ZeroOrOneBooleanContent)
Carry = DAG.getZExtOrTrunc(Carry, dl, HiLoVT);
else
@@ -8164,6 +8137,159 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
return true;
}
+static bool
+expandUDIVREMByConstantViaUMulHiMagic(SDNode *N, const APInt &Divisor,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG, SDValue LL,
+ SDValue LH, const TargetLowering &TLI) {
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0->getValueType(0);
+ SDLoc DL{N};
+
+ assert(!Divisor.isOne() && "Magic algorithm does not work for division by 1");
+
+ // This helper creates a MUL_LOHI of the pair (LL, LH) by a constant.
+ auto MakeMUL_LOHIByConst = [&](unsigned Opc, SDValue LL, SDValue LH,
+ const APInt &Const,
+ SmallVectorImpl<SDValue> &Result) {
+ SDValue LHS = DAG.getNode(ISD::BUILD_PAIR, DL, VT, LL, LH);
+ SDValue RHS = DAG.getConstant(Const, DL, VT);
+ auto [RL, RH] = DAG.SplitScalar(RHS, DL, HiLoVT, HiLoVT);
+ return TLI.expandMUL_LOHI(
+ Opc, VT, DL, LHS, RHS, Result, HiLoVT, DAG,
+ TargetLowering::MulExpansionKind::OnlyLegalOrCustom, LL, LH, RL, RH);
+ };
+
+ // This helper creates an ADD/SUB of the pairs (LL, LH) and (RL, RH).
+ auto MakeAddSubLong = [&](unsigned Opc, SDValue LL, SDValue LH, SDValue RL,
+ SDValue RH) {
+ SDValue AddSubNode =
+ DAG.getNode(Opc == ISD::ADD ? ISD::UADDO : ISD::USUBO, DL,
+ DAG.getVTList(HiLoVT, MVT::i1), LL, RL);
+ SDValue OutL, OutH, Overflow;
+ TLI.expandUADDSUBO(AddSubNode.getNode(), OutL, Overflow, DAG);
+ SDValue WithOverflow = DAG.getNode(
+ Opc, DL, HiLoVT, LH, DAG.getZExtOrTrunc(Overflow, DL, HiLoVT));
+ OutH = DAG.getNode(Opc, DL, HiLoVT, WithOverflow, RH);
+ return std::make_pair(OutL, OutH);
+ };
+
+ // This helper creates a SRL of the pair (LL, LH) by Shift.
+ auto MakeSRLLong = [&](SDValue LL, SDValue LH, unsigned Shift) {
+ unsigned HBitWidth = HiLoVT.getScalarSizeInBits();
+ if (Shift < HBitWidth) {
+ SDValue ShAmt = DAG.getConstant(Shift, DL, HiLoVT);
+ SDValue ResL = DAG.getNode(ISD::FSHR, DL, HiLoVT, LH, LL, ShAmt);
+ SDValue ResH = DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt);
+ return std::make_pair(ResL, ResH);
+ }
+ SDValue Zero = DAG.getConstant(0, DL, HiLoVT);
+ if (Shift == HBitWidth)
+ return std::make_pair(LH, Zero);
+ assert(Shift - HBitWidth < HBitWidth &&
+ "We shouldn't generate an undefined shift");
+ SDValue ShAmt = DAG.getConstant(Shift - HBitWidth, DL, HiLoVT);
+ return std::make_pair(DAG.getNode(ISD::SRL, DL, HiLoVT, LH, ShAmt), Zero);
+ };
+
+ // Knowledge of leading zeros may help to reduce the multiplier.
+ unsigned KnownLeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+
+ UnsignedDivisionByConstantInfo Magics = UnsignedDivisionByConstantInfo::get(
+ Divisor, std::min(KnownLeadingZeros, Divisor.countl_zero()));
+
+ assert(!LL == !LH && "Expected both input halves or no input halves!");
+ if (!LL)
+ std::tie(LL, LH) = DAG.SplitScalar(N0, DL, HiLoVT, HiLoVT);
+ SDValue QL = LL;
+ SDValue QH = LH;
+ if (Magics.PreShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PreShift);
+
+ SmallVector<SDValue, 2> UMulResult;
+ if (!MakeMUL_LOHIByConst(ISD::UMUL_LOHI, QL, QH, Magics.Magic, UMulResult))
+ return false;
+
+ QL = UMulResult[2];
+ QH = UMulResult[3];
+
+ if (Magics.IsAdd) {
+ auto [NPQL, NPQH] = MakeAddSubLong(ISD::SUB, LL, LH, QL, QH);
+ std::tie(NPQL, NPQH) = MakeSRLLong(NPQL, NPQH, 1);
+ std::tie(QL, QH) = MakeAddSubLong(ISD::ADD, NPQL, NPQH, QL, QH);
+ }
+
+ if (Magics.PostShift != 0)
+ std::tie(QL, QH) = MakeSRLLong(QL, QH, Magics.PostShift);
+
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::UREM) {
+ Result.push_back(QL);
+ Result.push_back(QH);
+ }
+
+ if (Opcode != ISD::UDIV) {
+ SmallVector<SDValue, 2> MulResult;
+ if (!MakeMUL_LOHIByConst(ISD::MUL, QL, QH, Divisor, MulResult))
+ return false;
+
+ assert(MulResult.size() == 2);
+
+ auto [RemL, RemH] =
+ MakeAddSubLong(ISD::SUB, LL, LH, MulResult[0], MulResult[1]);
+
+ Result.push_back(RemL);
+ Result.push_back(RemH);
+ }
+
+ return true;
+}
+
+bool TargetLowering::expandDIVREMByConstant(SDNode *N,
+ SmallVectorImpl<SDValue> &Result,
+ EVT HiLoVT, SelectionDAG &DAG,
+ SDValue LL, SDValue LH) const {
+ unsigned Opcode = N->getOpcode();
+
+ // TODO: Support signed division/remainder.
+ if (Opcode == ISD::SREM || Opcode == ISD::SDIV || Opcode == ISD::SDIVREM)
+ return false;
+ assert(
+ (Opcode == ISD::UREM || Opcode == ISD::UDIV || Opcode == ISD::UDIVREM) &&
+ "Unexpected opcode");
+
+ auto *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CN)
+ return false;
+
+ APInt Divisor = CN->getAPIntValue();
+
+ // We depend on the UREM by constant optimization in DAGCombiner that requires
+ // high multiply.
+ if (!isOperationLegalOrCustom(ISD::MULHU, HiLoVT) &&
+ !isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT))
+ return false;
+
+ // Don't expand if optimizing for size.
+ if (DAG.shouldOptForSize())
+ return false;
+
+ // Early out for 0 or 1 divisors.
+ if (Divisor.ule(1))
+ return false;
+
+ if (expandUDIVREMByConstantViaUREMDecomposition(N, Divisor, Result, HiLoVT,
+ DAG, LL, LH, *this))
+ return true;
+
+ if (expandUDIVREMByConstantViaUMulHiMagic(N, Divisor, Result, HiLoVT, DAG, LL,
+ LH, *this))
+ return true;
+
+ return false;
+}
+
// Check that (every element of) Z is undef or not an exact multiple of BW.
static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) {
return ISD::matchUnaryPredicate(
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index c57383ad9b1e7..0554b2e66a0be 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -513,13 +513,50 @@ entry:
define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: .cfi_offset w30, -16
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #9362 // =0x2492
+; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
+; CHECK-SD-NEXT: movk x8, #37449, lsl #16
+; CHECK-SD-NEXT: movk x11, #9362, lsl #16
+; CHECK-SD-NEXT: movk x8, #18724, lsl #32
+; CHECK-SD-NEXT: movk x11, #37449, lsl #32
+; CHECK-SD-NEXT: movk x8, #9362, lsl #48
+; CHECK-SD-NEXT: movk x11, #18724, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mul x11, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: mul x8, x1, x8
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: cinc x10, x13, hs
+; CHECK-SD-NEXT: adds x11, x9, x8
+; CHECK-SD-NEXT: cinc x12, x10, hs
+; CHECK-SD-NEXT: subs x13, x0, x11
+; CHECK-SD-NEXT: cset w14, lo
+; CHECK-SD-NEXT: sub x14, x1, x14
+; CHECK-SD-NEXT: sub x12, x14, x12
+; CHECK-SD-NEXT: extr x13, x12, x13, #1
+; CHECK-SD-NEXT: lsr x12, x12, #1
+; CHECK-SD-NEXT: adds x11, x13, x11
+; CHECK-SD-NEXT: cinc x12, x12, hs
+; CHECK-SD-NEXT: cmn x9, x8
+; CHECK-SD-NEXT: adc x8, x12, x10
+; CHECK-SD-NEXT: mov w10, #7 // =0x7
+; CHECK-SD-NEXT: extr x9, x8, x11, #2
+; CHECK-SD-NEXT: lsr x8, x8, #2
+; CHECK-SD-NEXT: umulh x10, x9, x10
+; CHECK-SD-NEXT: lsl x11, x9, #3
+; CHECK-SD-NEXT: sub x9, x11, x9
+; CHECK-SD-NEXT: subs x0, x0, x9
+; CHECK-SD-NEXT: cset w9, lo
+; CHECK-SD-NEXT: sub x10, x10, x8
+; CHECK-SD-NEXT: sub x9, x1, x9
+; CHECK-SD-NEXT: add x8, x10, x8, lsl #3
+; CHECK-SD-NEXT: sub x1, x9, x8
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_7:
@@ -596,13 +633,38 @@ entry:
define i128 @ui128_100(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_100:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: .cfi_offset w30, -16
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2
+; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29
+; CHECK-SD-NEXT: movk x8, #23592, lsl #16
+; CHECK-SD-NEXT: movk x11, #49807, lsl #16
+; CHECK-SD-NEXT: movk x8, #49807, lsl #32
+; CHECK-SD-NEXT: movk x11, #10485, lsl #32
+; CHECK-SD-NEXT: movk x8, #10485, lsl #48
+; CHECK-SD-NEXT: movk x11, #36700, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mul x11, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: mul x8, x1, x8
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: cinc x10, x13, hs
+; CHECK-SD-NEXT: adds x8, x9, x8
+; CHECK-SD-NEXT: cinc x9, x10, hs
+; CHECK-SD-NEXT: mov w10, #100 // =0x64
+; CHECK-SD-NEXT: extr x8, x9, x8, #4
+; CHECK-SD-NEXT: lsr x9, x9, #4
+; CHECK-SD-NEXT: umulh x11, x8, x10
+; CHECK-SD-NEXT: mul x8, x8, x10
+; CHECK-SD-NEXT: madd x9, x9, x10, x11
+; CHECK-SD-NEXT: subs x0, x0, x8
+; CHECK-SD-NEXT: cset w8, lo
+; CHECK-SD-NEXT: sub x8, x1, x8
+; CHECK-SD-NEXT: sub x1, x8, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_100:
@@ -3204,34 +3266,85 @@ entry:
define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_7:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w30, -48
-; CHECK-SD-NEXT: mov x19, x3
-; CHECK-SD-NEXT: mov x20, x2
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x21, x0
-; CHECK-SD-NEXT: mov x22, x1
-; CHECK-SD-NEXT: mov x0, x20
-; CHECK-SD-NEXT: mov x1, x19
-; CHECK-SD-NEXT: mov w2, #7 // =0x7
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x2, x0
-; CHECK-SD-NEXT: mov x3, x1
-; CHECK-SD-NEXT: mov x0, x21
-; CHECK-SD-NEXT: mov x1, x22
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #9362 // =0x2492
+; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
+; CHECK-SD-NEXT: movk x8, #37449, lsl #16
+; CHECK-SD-NEXT: movk x11, #9362, lsl #16
+; CHECK-SD-NEXT: movk x8, #18724, lsl #32
+; CHECK-SD-NEXT: movk x11, #37449, lsl #32
+; CHECK-SD-NEXT: movk x8, #9362, lsl #48
+; CHECK-SD-NEXT: movk x11, #18724, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: mul x15, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: cmn x10, x15
+; CHECK-SD-NEXT: mul x16, x1, x8
+; CHECK-SD-NEXT: adcs x9, x9, x14
+; CHECK-SD-NEXT: mul x12, x2, x8
+; CHECK-SD-NEXT: cinc x13, x13, hs
+; CHECK-SD-NEXT: umulh x10, x2, x11
+; CHECK-SD-NEXT: adds x14, x9, x16
+; CHECK-SD-NEXT: cinc x15, x13, hs
+; CHECK-SD-NEXT: subs x18, x0, x14
+; CHECK-SD-NEXT: umulh x17, x2, x8
+; CHECK-SD-NEXT: cset w5, lo
+; CHECK-SD-NEXT: sub x5, x1, x5
+; CHECK-SD-NEXT: umulh x6, x3, x11
+; CHECK-SD-NEXT: sub x15, x5, x15
+; CHECK-SD-NEXT: extr x18, x15, x18, #1
+; CHECK-SD-NEXT: mul x11, x3, x11
+; CHECK-SD-NEXT: lsr x15, x15, #1
+; CHECK-SD-NEXT: umulh x4, x3, x8
+; CHECK-SD-NEXT: adds x14, x18, x14
+; CHECK-SD-NEXT: cinc x15, x15, hs
+; CHECK-SD-NEXT: cmn x9, x16
+; CHECK-SD-NEXT: mul x8, x3, x8
+; CHECK-SD-NEXT: adc x9, x15, x13
+; CHECK-SD-NEXT: adds x10, x10, x12
+; CHECK-SD-NEXT: cinc x12, x17, hs
+; CHECK-SD-NEXT: cmn x10, x11
+; CHECK-SD-NEXT: adcs x10, x12, x6
+; CHECK-SD-NEXT: cinc x11, x4, hs
+; CHECK-SD-NEXT: adds x12, x10, x8
+; CHECK-SD-NEXT: cinc x13, x11, hs
+; CHECK-SD-NEXT: subs x15, x2, x12
+; CHECK-SD-NEXT: cset w16, lo
+; CHECK-SD-NEXT: sub x16, x3, x16
+; CHECK-SD-NEXT: sub x13, x16, x13
+; CHECK-SD-NEXT: extr x15, x13, x15, #1
+; CHECK-SD-NEXT: lsr x13, x13, #1
+; CHECK-SD-NEXT: adds x12, x15, x12
+; CHECK-SD-NEXT: cinc x13, x13, hs
+; CHECK-SD-NEXT: cmn x10, x8
+; CHECK-SD-NEXT: extr x8, x9, x14, #2
+; CHECK-SD-NEXT: adc x10, x13, x11
+; CHECK-SD-NEXT: mov w11, #7 // =0x7
+; CHECK-SD-NEXT: lsr x9, x9, #2
+; CHECK-SD-NEXT: extr x12, x10, x12, #2
+; CHECK-SD-NEXT: umulh x13, x8, x11
+; CHECK-SD-NEXT: lsl x14, x8, #3
+; CHECK-SD-NEXT: lsr x10, x10, #2
+; CHECK-SD-NEXT: umulh x11, x12, x11
+; CHECK-SD-NEXT: lsl x15, x12, #3
+; CHECK-SD-NEXT: sub x8, x14, x8
+; CHECK-SD-NEXT: subs x0, x0, x8
+; CHECK-SD-NEXT: sub x8, x15, x12
+; CHECK-SD-NEXT: cset w12, lo
+; CHECK-SD-NEXT: sub x13, x13, x9
+; CHECK-SD-NEXT: subs x2, x2, x8
+; CHECK-SD-NEXT: add x8, x13, x9, lsl #3
+; CHECK-SD-NEXT: sub x11, x11, x10
+; CHECK-SD-NEXT: add x9, x11, x10, lsl #3
+; CHECK-SD-NEXT: cset w10, lo
+; CHECK-SD-NEXT: sub x11, x1, x12
+; CHECK-SD-NEXT: sub x10, x3, x10
+; CHECK-SD-NEXT: sub x1, x11, x8
+; CHECK-SD-NEXT: sub x3, x10, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_7:
@@ -3361,34 +3474,61 @@ entry:
define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_100:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w30, -48
-; CHECK-SD-NEXT: mov x19, x3
-; CHECK-SD-NEXT: mov x20, x2
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x21, x0
-; CHECK-SD-NEXT: mov x22, x1
-; CHECK-SD-NEXT: mov x0, x20
-; CHECK-SD-NEXT: mov x1, x19
-; CHECK-SD-NEXT: mov w2, #100 // =0x64
-; CHECK-SD-NEXT: mov x3, xzr
-; CHECK-SD-NEXT: bl __umodti3
-; CHECK-SD-NEXT: mov x2, x0
-; CHECK-SD-NEXT: mov x3, x1
-; CHECK-SD-NEXT: mov x0, x21
-; CHECK-SD-NEXT: mov x1, x22
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x8, #62914 // =0xf5c2
+; CHECK-SD-NEXT: mov x11, #23593 // =0x5c29
+; CHECK-SD-NEXT: movk x8, #23592, lsl #16
+; CHECK-SD-NEXT: movk x11, #49807, lsl #16
+; CHECK-SD-NEXT: movk x8, #49807, lsl #32
+; CHECK-SD-NEXT: movk x11, #10485, lsl #32
+; CHECK-SD-NEXT: movk x8, #10485, lsl #48
+; CHECK-SD-NEXT: movk x11, #36700, lsl #48
+; CHECK-SD-NEXT: mul x10, x0, x8
+; CHECK-SD-NEXT: umulh x12, x0, x11
+; CHECK-SD-NEXT: umulh x9, x0, x8
+; CHECK-SD-NEXT: mul x15, x1, x11
+; CHECK-SD-NEXT: adds x10, x12, x10
+; CHECK-SD-NEXT: mov w12, #100 // =0x64
+; CHECK-SD-NEXT: umulh x14, x1, x11
+; CHECK-SD-NEXT: cinc x9, x9, hs
+; CHECK-SD-NEXT: umulh x13, x1, x8
+; CHECK-SD-NEXT: cmn x10, x15
+; CHECK-SD-NEXT: mul x16, x1, x8
+; CHECK-SD-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154968
More information about the llvm-commits
mailing list