[llvm] 3d5d32c - [CGP]: Optimize mul.overflow. (#148343)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 05:15:51 PST 2025
Author: Hassnaa Hamdi
Date: 2025-11-18T13:15:47Z
New Revision: 3d5d32c6058807008e579dd5ea2faced33a7943b
URL: https://github.com/llvm/llvm-project/commit/3d5d32c6058807008e579dd5ea2faced33a7943b
DIFF: https://github.com/llvm/llvm-project/commit/3d5d32c6058807008e579dd5ea2faced33a7943b.diff
LOG: [CGP]: Optimize mul.overflow. (#148343)
- Detect cases where the LHS & RHS values will not cause overflow
(when the Hi halves are zero).
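To illustrate the shape of the rewrite, the following is a hand-written LLVM IR
sketch of roughly what CodeGenPrepare now emits for an unsigned i128
multiply-with-overflow; block and value names follow the pass but are
illustrative rather than the exact output, and the signed variant instead
compares each high half against the sign bits of its low half:

define { i128, i1 } @umul_i128_sketch(i128 %x, i128 %y) {
entry:
  %lhs.lsr = lshr i128 %x, 64
  %hi.lhs = trunc i128 %lhs.lsr to i64
  %rhs.lsr = lshr i128 %y, 64
  %hi.rhs = trunc i128 %rhs.lsr to i64
  %lhs.nz = icmp ne i64 %hi.lhs, 0
  %rhs.nz = icmp ne i64 %hi.rhs, 0
  %or.lhs.rhs = or i1 %lhs.nz, %rhs.nz
  br i1 %or.lhs.rhs, label %overflow, label %overflow.no

overflow.no:                       ; both high halves are zero, so a 64x64 multiply cannot overflow i128
  %lo.lhs = trunc i128 %x to i64
  %lo.rhs = trunc i128 %y to i64
  %lo.lhs.ext = zext i64 %lo.lhs to i128
  %lo.rhs.ext = zext i64 %lo.rhs to i128
  %mul.overflow.no = mul i128 %lo.lhs.ext, %lo.rhs.ext
  br label %overflow.res

overflow:                          ; slow path keeps the original intrinsic
  %m = call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
  %mul.overflow = extractvalue { i128, i1 } %m, 0
  %overflow.flag = extractvalue { i128, i1 } %m, 1
  br label %overflow.res

overflow.res:                      ; PHI nodes replace the original extractvalue users
  %res = phi i128 [ %mul.overflow.no, %overflow.no ], [ %mul.overflow, %overflow ]
  %ovf = phi i1 [ false, %overflow.no ], [ %overflow.flag, %overflow ]
  %agg0 = insertvalue { i128, i1 } poison, i128 %res, 0
  %agg1 = insertvalue { i128, i1 } %agg0, i1 %ovf, 1
  ret { i128, i1 } %agg1
}

declare { i128, i1 } @llvm.umul.with.overflow.i128(i128, i128)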
Added:
llvm/test/CodeGen/AArch64/mul-i128-overflow.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/CodeGenPrepare.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/i128-math.ll
llvm/test/CodeGen/AArch64/i128_with_overflow.ll
llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cec7d09f494d6..4c932c523e423 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3492,6 +3492,13 @@ class LLVM_ABI TargetLoweringBase {
return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT));
}
+ // Return true if the target wants to optimize the mul overflow intrinsic
+ // for the given \p VT.
+ virtual bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context,
+ EVT VT) const {
+ return false;
+ }
+
// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
// even if the vector itself has multiple uses.
virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index b6dd174f9be80..587c1372b19cb 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -431,6 +431,8 @@ class CodeGenPrepare {
bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
unsigned AddrSpace);
bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+ bool optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+ ModifyDT &ModifiedDT);
bool optimizeInlineAsmInst(CallInst *CS);
bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
bool optimizeExt(Instruction *&I);
@@ -2797,6 +2799,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
}
}
return false;
+ case Intrinsic::umul_with_overflow:
+ return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT);
+ case Intrinsic::smul_with_overflow:
+ return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT);
}
SmallVector<Value *, 2> PtrOps;
@@ -6391,6 +6397,182 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
return true;
}
+// This is a helper for CodeGenPrepare::optimizeMulWithOverflow.
+// Check for the pattern we are interested in, where the intrinsic has at most
+// two uses, both of which are extractvalue instructions.
+static bool matchOverflowPattern(Instruction *&I, ExtractValueInst *&MulExtract,
+ ExtractValueInst *&OverflowExtract) {
+ // Bail out if there are more than 2 users:
+ if (I->hasNUsesOrMore(3))
+ return false;
+
+ for (User *U : I->users()) {
+ auto *Extract = dyn_cast<ExtractValueInst>(U);
+ if (!Extract || Extract->getNumIndices() != 1)
+ return false;
+
+ unsigned Index = Extract->getIndices()[0];
+ if (Index == 0)
+ MulExtract = Extract;
+ else if (Index == 1)
+ OverflowExtract = Extract;
+ else
+ return false;
+ }
+ return true;
+}
+
+// Rewrite the mul_with_overflow intrinsic by checking whether both operands'
+// value ranges fit within the legal (half-width) type. If so, we can use a
+// cheaper multiplication. This transformation would naturally be done during
+// type legalization, but since it requires reconstructing the IR, which is not
+// possible there, we do it here instead.
+// The IR after the optimization will look like:
+// entry:
+// if signed:
+// ((lhs_lo >> BW-1) ^ lhs_hi) || ((rhs_lo >> BW-1) ^ rhs_hi)
+// ? br overflow : br overflow.no
+// else:
+// (lhs_hi != 0) || (rhs_hi != 0) ? br overflow : br overflow.no
+// overflow.no: multiply the extended low halves; the overflow flag is false
+// overflow: keep the original intrinsic and extract its results
+// overflow.res: PHI nodes merge the results of the two paths
+// \returns true if optimization was applied
+// TODO: This optimization could be further improved for code that branches on
+// the overflow flag: the 'overflow.no' BB could branch directly to the false
+// successor of that branch. That would add additional complexity, so we leave
+// it for future work.
+bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned,
+ ModifyDT &ModifiedDT) {
+ // Check if target supports this optimization.
+ if (!TLI->shouldOptimizeMulOverflowWithZeroHighBits(
+ I->getContext(),
+ TLI->getValueType(*DL, I->getType()->getContainedType(0))))
+ return false;
+
+ ExtractValueInst *MulExtract = nullptr, *OverflowExtract = nullptr;
+ if (!matchOverflowPattern(I, MulExtract, OverflowExtract))
+ return false;
+
+ // Keep track of the instruction so we do not try to optimize it again.
+ InsertedInsts.insert(I);
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ Type *Ty = LHS->getType();
+ unsigned VTHalfBitWidth = Ty->getScalarSizeInBits() / 2;
+ Type *LegalTy = Ty->getWithNewBitWidth(VTHalfBitWidth);
+
+ // New BBs:
+ BasicBlock *OverflowEntryBB =
+ I->getParent()->splitBasicBlock(I, "", /*Before*/ true);
+ OverflowEntryBB->takeName(I->getParent());
+ // Keep the 'br' instruction generated by the split so that it can be erased
+ // later.
+ Instruction *OldTerminator = OverflowEntryBB->getTerminator();
+ BasicBlock *NoOverflowBB =
+ BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction());
+ NoOverflowBB->moveAfter(OverflowEntryBB);
+ BasicBlock *OverflowBB =
+ BasicBlock::Create(I->getContext(), "overflow", I->getFunction());
+ OverflowBB->moveAfter(NoOverflowBB);
+
+ // BB overflow.entry:
+ IRBuilder<> Builder(OverflowEntryBB);
+ // Extract low and high halves of LHS:
+ Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs");
+ Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr");
+ HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs");
+
+ // Extract low and high halves of RHS:
+ Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs");
+ Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr");
+ HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs");
+
+ Value *IsAnyBitTrue;
+ if (IsSigned) {
+ Value *SignLoLHS =
+ Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs");
+ Value *SignLoRHS =
+ Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs");
+ Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS);
+ Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS);
+ Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs");
+ IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or,
+ ConstantInt::getNullValue(Or->getType()));
+ } else {
+ Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS,
+ ConstantInt::getNullValue(LegalTy));
+ Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS,
+ ConstantInt::getNullValue(LegalTy));
+ IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs");
+ }
+ Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB);
+
+ // BB overflow.no:
+ Builder.SetInsertPoint(NoOverflowBB);
+ Value *ExtLoLHS, *ExtLoRHS;
+ if (IsSigned) {
+ ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext");
+ ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext");
+ } else {
+ ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext");
+ ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext");
+ }
+
+ Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no");
+
+ // The remainder of the original block becomes the 'overflow.res' BB, which
+ // merges the results of the two paths:
+ BasicBlock *OverflowResBB = I->getParent();
+ OverflowResBB->setName("overflow.res");
+
+ // BB overflow.no: jump to overflow.res BB
+ Builder.CreateBr(OverflowResBB);
+ // Now we no longer need the old terminator in the overflow.entry BB; erase it:
+ OldTerminator->eraseFromParent();
+
+ // BB overflow.res:
+ Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt());
+ // Create PHI nodes that merge the results from the overflow.no BB and the
+ // overflow BB and will replace the extract instructions.
+ PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2),
+ *OverflowFlagPHI =
+ Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2);
+
+ // Add the incoming values from the overflow.no BB now; the values from the
+ // overflow BB are added later.
+ OverflowResPHI->addIncoming(Mul, NoOverflowBB);
+ OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()),
+ NoOverflowBB);
+
+ // Replace all uses of MulExtract and OverflowExtract with the PHI nodes.
+ if (MulExtract) {
+ MulExtract->replaceAllUsesWith(OverflowResPHI);
+ MulExtract->eraseFromParent();
+ }
+ if (OverflowExtract) {
+ OverflowExtract->replaceAllUsesWith(OverflowFlagPHI);
+ OverflowExtract->eraseFromParent();
+ }
+
+ // Remove the intrinsic from its parent (the overflow.res BB), as it will
+ // become part of the overflow BB.
+ I->removeFromParent();
+ // BB overflow:
+ I->insertInto(OverflowBB, OverflowBB->end());
+ Builder.SetInsertPoint(OverflowBB, OverflowBB->end());
+ Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow");
+ Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag");
+ Builder.CreateBr(OverflowResBB);
+
+ // Add the extracted values to the PHI nodes in the overflow.res BB.
+ OverflowResPHI->addIncoming(MulOverflow, OverflowBB);
+ OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB);
+
+ ModifiedDT = ModifyDT::ModifyBBDT;
+ return true;
+}
+
/// If there are any memory operands, use OptimizeMemoryInst to sink their
/// address computing into the block when possible / profitable.
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 42567883b2594..d21e19b2ecd46 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18851,6 +18851,15 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
return (Index == 0 || Index == ResVT.getVectorMinNumElements());
}
+bool AArch64TargetLowering::shouldOptimizeMulOverflowWithZeroHighBits(
+ LLVMContext &Context, EVT VT) const {
+ if (getTypeAction(Context, VT) != TypeExpandInteger)
+ return false;
+
+ EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
+ return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
+}
+
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 70bfae717fb76..be198e54cbcbf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -333,6 +333,11 @@ class AArch64TargetLowering : public TargetLowering {
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
+ // Return true if the target wants to optimize the mul overflow intrinsic
+ // for the given \p VT.
+ bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context,
+ EVT VT) const override;
+
Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll
index 9e1c0c1b115ab..12ae241dda4bd 100644
--- a/llvm/test/CodeGen/AArch64/i128-math.ll
+++ b/llvm/test/CodeGen/AArch64/i128-math.ll
@@ -262,20 +262,28 @@ define i128 @u128_mul(i128 %x, i128 %y) {
define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: u128_checked_mul:
; CHECK: // %bb.0:
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB17_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: mul x9, x3, x0
; CHECK-NEXT: cmp x1, #0
; CHECK-NEXT: ccmp x3, #0, #4, ne
-; CHECK-NEXT: umulh x8, x1, x2
-; CHECK-NEXT: umulh x10, x3, x0
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x8, x3, x0
; CHECK-NEXT: madd x9, x1, x2, x9
-; CHECK-NEXT: ccmp xzr, x8, #0, eq
-; CHECK-NEXT: umulh x11, x0, x2
; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x11, x0, x2
+; CHECK-NEXT: ccmp xzr, x8, #0, eq
; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: cset w8, ne
; CHECK-NEXT: adds x1, x11, x9
; CHECK-NEXT: csinc w8, w8, wzr, lo
; CHECK-NEXT: eor w2, w8, #0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB17_2: // %overflow.no
+; CHECK-NEXT: umulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: eor w2, w8, #0x1
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
@@ -290,19 +298,27 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) {
define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
; CHECK-LABEL: u128_overflowing_mul:
; CHECK: // %bb.0:
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB18_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: mul x9, x3, x0
; CHECK-NEXT: cmp x1, #0
; CHECK-NEXT: ccmp x3, #0, #4, ne
-; CHECK-NEXT: umulh x8, x1, x2
-; CHECK-NEXT: umulh x10, x3, x0
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x8, x3, x0
; CHECK-NEXT: madd x9, x1, x2, x9
-; CHECK-NEXT: ccmp xzr, x8, #0, eq
-; CHECK-NEXT: umulh x11, x0, x2
; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x11, x0, x2
+; CHECK-NEXT: ccmp xzr, x8, #0, eq
; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: cset w8, ne
; CHECK-NEXT: adds x1, x11, x9
; CHECK-NEXT: csinc w2, w8, wzr, lo
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB18_2: // %overflow.no
+; CHECK-NEXT: umulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: mov w2, wzr
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
@@ -316,19 +332,28 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) {
define i128 @u128_saturating_mul(i128 %x, i128 %y) {
; CHECK-LABEL: u128_saturating_mul:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul x9, x3, x0
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB19_2
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: mul x8, x3, x0
; CHECK-NEXT: cmp x1, #0
; CHECK-NEXT: ccmp x3, #0, #4, ne
-; CHECK-NEXT: umulh x8, x1, x2
-; CHECK-NEXT: umulh x10, x3, x0
-; CHECK-NEXT: madd x9, x1, x2, x9
-; CHECK-NEXT: ccmp xzr, x8, #0, eq
-; CHECK-NEXT: umulh x11, x0, x2
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x9, x3, x0
+; CHECK-NEXT: madd x11, x1, x2, x8
; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x12, x0, x2
+; CHECK-NEXT: ccmp xzr, x9, #0, eq
; CHECK-NEXT: mul x8, x0, x2
; CHECK-NEXT: cset w10, ne
-; CHECK-NEXT: adds x9, x11, x9
+; CHECK-NEXT: adds x9, x12, x11
; CHECK-NEXT: csinc w10, w10, wzr, lo
+; CHECK-NEXT: b .LBB19_3
+; CHECK-NEXT: .LBB19_2: // %overflow.no
+; CHECK-NEXT: umulh x9, x0, x2
+; CHECK-NEXT: mov w10, wzr
+; CHECK-NEXT: mul x8, x0, x2
+; CHECK-NEXT: .LBB19_3: // %overflow.res
; CHECK-NEXT: cmp w10, #0
; CHECK-NEXT: csinv x0, x8, xzr, eq
; CHECK-NEXT: csinv x1, x9, xzr, eq
@@ -355,6 +380,11 @@ define i128 @i128_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_checked_mul:
; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cbz x8, .LBB21_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
@@ -364,24 +394,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
-; CHECK-NEXT: adc x9, x8, x9
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mov x9, x1
; CHECK-NEXT: mul x13, x0, x13
-; CHECK-NEXT: adds x8, x14, x10
-; CHECK-NEXT: mul x15, x1, x3
-; CHECK-NEXT: smulh x10, x1, x3
-; CHECK-NEXT: mov x1, x8
-; CHECK-NEXT: adc x11, x12, x13
-; CHECK-NEXT: asr x12, x9, #63
-; CHECK-NEXT: asr x13, x11, #63
-; CHECK-NEXT: adds x9, x9, x11
; CHECK-NEXT: asr x11, x8, #63
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: adds x1, x14, x10
+; CHECK-NEXT: smulh x9, x9, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: asr x10, x1, #63
; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: adc x12, x12, x13
-; CHECK-NEXT: adds x9, x15, x9
-; CHECK-NEXT: adc x10, x10, x12
-; CHECK-NEXT: cmp x9, x11
-; CHECK-NEXT: ccmp x10, x11, #0, eq
-; CHECK-NEXT: cset w2, eq
+; CHECK-NEXT: adc x11, x11, x12
+; CHECK-NEXT: adds x8, x15, x8
+; CHECK-NEXT: adc x9, x9, x11
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x10, #0, eq
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: eor w2, w8, #0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB21_2: // %overflow.no
+; CHECK-NEXT: smulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: eor w2, w8, #0x1
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
@@ -396,6 +432,11 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_overflowing_mul:
; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cbz x8, .LBB22_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
@@ -405,24 +446,29 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
-; CHECK-NEXT: adc x9, x8, x9
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mov x9, x1
; CHECK-NEXT: mul x13, x0, x13
-; CHECK-NEXT: adds x8, x14, x10
-; CHECK-NEXT: mul x15, x1, x3
-; CHECK-NEXT: smulh x10, x1, x3
-; CHECK-NEXT: mov x1, x8
-; CHECK-NEXT: adc x11, x12, x13
-; CHECK-NEXT: asr x12, x9, #63
-; CHECK-NEXT: asr x13, x11, #63
-; CHECK-NEXT: adds x9, x9, x11
; CHECK-NEXT: asr x11, x8, #63
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: adds x1, x14, x10
+; CHECK-NEXT: smulh x9, x9, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: asr x10, x1, #63
; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: adc x12, x12, x13
-; CHECK-NEXT: adds x9, x15, x9
-; CHECK-NEXT: adc x10, x10, x12
-; CHECK-NEXT: cmp x9, x11
-; CHECK-NEXT: ccmp x10, x11, #0, eq
+; CHECK-NEXT: adc x11, x11, x12
+; CHECK-NEXT: adds x8, x15, x8
+; CHECK-NEXT: adc x9, x9, x11
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x10, #0, eq
; CHECK-NEXT: cset w2, ne
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB22_2: // %overflow.no
+; CHECK-NEXT: smulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: mov w2, wzr
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
@@ -436,6 +482,11 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
define i128 @i128_saturating_mul(i128 %x, i128 %y) {
; CHECK-LABEL: i128_saturating_mul:
; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cbz x8, .LBB23_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: asr x9, x1, #63
; CHECK-NEXT: umulh x10, x0, x2
; CHECK-NEXT: asr x13, x3, #63
@@ -445,29 +496,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) {
; CHECK-NEXT: adds x10, x11, x10
; CHECK-NEXT: mul x14, x0, x3
; CHECK-NEXT: umulh x12, x0, x3
-; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: adc x9, x8, x9
; CHECK-NEXT: mul x13, x0, x13
-; CHECK-NEXT: adds x9, x14, x10
-; CHECK-NEXT: mul x11, x1, x3
-; CHECK-NEXT: adc x10, x12, x13
-; CHECK-NEXT: smulh x12, x1, x3
-; CHECK-NEXT: asr x13, x8, #63
-; CHECK-NEXT: asr x14, x10, #63
-; CHECK-NEXT: adds x8, x8, x10
-; CHECK-NEXT: adc x10, x13, x14
-; CHECK-NEXT: adds x8, x11, x8
-; CHECK-NEXT: asr x11, x9, #63
-; CHECK-NEXT: mul x13, x0, x2
-; CHECK-NEXT: adc x10, x12, x10
-; CHECK-NEXT: eor x12, x3, x1
-; CHECK-NEXT: eor x8, x8, x11
-; CHECK-NEXT: eor x10, x10, x11
-; CHECK-NEXT: asr x11, x12, #63
-; CHECK-NEXT: orr x8, x8, x10
-; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csinv x0, x13, x11, eq
-; CHECK-NEXT: csel x1, x10, x9, ne
+; CHECK-NEXT: adds x8, x14, x10
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: asr x14, x8, #63
+; CHECK-NEXT: smulh x10, x1, x3
+; CHECK-NEXT: adc x11, x12, x13
+; CHECK-NEXT: asr x12, x9, #63
+; CHECK-NEXT: asr x13, x11, #63
+; CHECK-NEXT: adds x11, x9, x11
+; CHECK-NEXT: mul x9, x0, x2
+; CHECK-NEXT: adc x12, x12, x13
+; CHECK-NEXT: adds x11, x15, x11
+; CHECK-NEXT: adc x10, x10, x12
+; CHECK-NEXT: cmp x11, x14
+; CHECK-NEXT: ccmp x10, x14, #0, eq
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: b .LBB23_3
+; CHECK-NEXT: .LBB23_2: // %overflow.no
+; CHECK-NEXT: smulh x8, x0, x2
+; CHECK-NEXT: mov w10, wzr
+; CHECK-NEXT: mul x9, x0, x2
+; CHECK-NEXT: .LBB23_3: // %overflow.res
+; CHECK-NEXT: eor x11, x3, x1
+; CHECK-NEXT: cmp w10, #0
+; CHECK-NEXT: asr x11, x11, #63
+; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff
+; CHECK-NEXT: csinv x0, x9, x11, eq
+; CHECK-NEXT: csel x1, x12, x8, ne
; CHECK-NEXT: ret
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%2 = extractvalue { i128, i1 } %1, 0
diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
index 9924b7c63f763..3d90e094a5747 100644
--- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
+++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
@@ -224,21 +224,29 @@ cleanup:
define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
; CHECK-LABEL: test_umul_i128:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB4_2
+; CHECK-NEXT: // %bb.1: // %overflow
; CHECK-NEXT: mul x9, x3, x0
; CHECK-NEXT: cmp x1, #0
; CHECK-NEXT: ccmp x3, #0, #4, ne
-; CHECK-NEXT: umulh x8, x1, x2
-; CHECK-NEXT: umulh x10, x3, x0
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x8, x3, x0
; CHECK-NEXT: madd x9, x1, x2, x9
-; CHECK-NEXT: ccmp xzr, x8, #0, eq
-; CHECK-NEXT: umulh x11, x0, x2
; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x11, x0, x2
+; CHECK-NEXT: ccmp xzr, x8, #0, eq
+; CHECK-NEXT: mul x0, x0, x2
; CHECK-NEXT: cset w8, ne
; CHECK-NEXT: adds x1, x11, x9
; CHECK-NEXT: csinc w8, w8, wzr, lo
-; CHECK-NEXT: cmp w8, #1
-; CHECK-NEXT: b.ne .LBB4_2
-; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: cbnz w8, .LBB4_3
+; CHECK-NEXT: b .LBB4_4
+; CHECK-NEXT: .LBB4_2: // %overflow.no
+; CHECK-NEXT: umulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: cbz w8, .LBB4_4
+; CHECK-NEXT: .LBB4_3: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
@@ -247,9 +255,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
; CHECK-NEXT: sxtw x0, w0
; CHECK-NEXT: asr x1, x0, #63
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB4_2: // %if.end
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: .LBB4_4: // %cleanup
; CHECK-NEXT: ret
entry:
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
@@ -273,34 +279,40 @@ cleanup:
define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
; CHECK-LABEL: test_smul_i128:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: asr x10, x1, #63
-; CHECK-NEXT: umulh x11, x0, x2
-; CHECK-NEXT: asr x14, x3, #63
-; CHECK-NEXT: mov x8, x1
-; CHECK-NEXT: mul x12, x1, x2
-; CHECK-NEXT: umulh x9, x1, x2
-; CHECK-NEXT: mul x10, x10, x2
-; CHECK-NEXT: adds x11, x12, x11
-; CHECK-NEXT: mul x15, x0, x3
-; CHECK-NEXT: umulh x13, x0, x3
-; CHECK-NEXT: adc x9, x9, x10
-; CHECK-NEXT: mul x14, x0, x14
-; CHECK-NEXT: mul x16, x1, x3
-; CHECK-NEXT: adds x1, x15, x11
-; CHECK-NEXT: asr x11, x9, #63
-; CHECK-NEXT: smulh x8, x8, x3
-; CHECK-NEXT: adc x10, x13, x14
-; CHECK-NEXT: asr x12, x10, #63
-; CHECK-NEXT: adds x9, x9, x10
-; CHECK-NEXT: adc x10, x11, x12
-; CHECK-NEXT: adds x9, x16, x9
-; CHECK-NEXT: asr x11, x1, #63
-; CHECK-NEXT: adc x8, x8, x10
-; CHECK-NEXT: eor x8, x8, x11
-; CHECK-NEXT: eor x9, x9, x11
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cbz x8, .LBB5_2
-; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: cbz x8, .LBB5_4
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mov x9, x1
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: asr x11, x8, #63
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: adds x1, x14, x10
+; CHECK-NEXT: smulh x9, x9, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: asr x10, x1, #63
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: adc x11, x11, x12
+; CHECK-NEXT: adds x8, x15, x8
+; CHECK-NEXT: adc x9, x9, x11
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x10, #0, eq
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: cbz w8, .LBB5_3
+; CHECK-NEXT: .LBB5_2: // %if.then
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
@@ -309,10 +321,13 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) {
; CHECK-NEXT: sxtw x0, w0
; CHECK-NEXT: asr x1, x0, #63
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB5_3: // %cleanup
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB5_2: // %if.end
+; CHECK-NEXT: .LBB5_4: // %overflow.no
+; CHECK-NEXT: smulh x1, x0, x2
; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: ret
+; CHECK-NEXT: cbnz w8, .LBB5_2
+; CHECK-NEXT: b .LBB5_3
entry:
%0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
%1 = extractvalue { i128, i1 } %0, 1
diff --git a/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll
new file mode 100644
index 0000000000000..7b60f81539aa8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s
+
+
+declare i32 @error()
+
+define i128 @test1(i128 noundef %x, i128 noundef %y) {
+; CHECK-LABEL: test1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cbz x8, .LBB0_4
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mov x9, x1
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: asr x11, x8, #63
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: adds x1, x14, x10
+; CHECK-NEXT: smulh x9, x9, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: asr x10, x1, #63
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: adc x11, x11, x12
+; CHECK-NEXT: adds x8, x15, x8
+; CHECK-NEXT: adc x9, x9, x11
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x10, #0, eq
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: cbz w8, .LBB0_3
+; CHECK-NEXT: .LBB0_2: // %if.then
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl error
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x0, w0
+; CHECK-NEXT: asr x1, x0, #63
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB0_3: // %cleanup
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_4: // %overflow.no
+; CHECK-NEXT: smulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: cbnz w8, .LBB0_2
+; CHECK-NEXT: b .LBB0_3
+entry:
+ %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
+ %1 = extractvalue { i128, i1 } %0, 1
+ br i1 %1, label %if.then, label %if.end
+
+if.then:
+ %call = tail call i32 @error()
+ %conv1 = sext i32 %call to i128
+ br label %cleanup
+
+if.end:
+ %2 = extractvalue { i128, i1 } %0, 0
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ]
+ ret i128 %retval.0
+}
+
+define i128 @test2(i128 noundef %x, i128 noundef %y, ptr %out) {
+; CHECK-LABEL: test2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: eor x8, x3, x2, asr #63
+; CHECK-NEXT: eor x9, x1, x0, asr #63
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cbz x8, .LBB1_4
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: asr x9, x1, #63
+; CHECK-NEXT: umulh x10, x0, x2
+; CHECK-NEXT: asr x13, x3, #63
+; CHECK-NEXT: mul x11, x1, x2
+; CHECK-NEXT: umulh x8, x1, x2
+; CHECK-NEXT: mul x9, x9, x2
+; CHECK-NEXT: adds x10, x11, x10
+; CHECK-NEXT: mul x14, x0, x3
+; CHECK-NEXT: umulh x12, x0, x3
+; CHECK-NEXT: adc x8, x8, x9
+; CHECK-NEXT: mov x9, x1
+; CHECK-NEXT: mul x13, x0, x13
+; CHECK-NEXT: asr x11, x8, #63
+; CHECK-NEXT: mul x15, x1, x3
+; CHECK-NEXT: adds x1, x14, x10
+; CHECK-NEXT: smulh x9, x9, x3
+; CHECK-NEXT: adc x10, x12, x13
+; CHECK-NEXT: asr x12, x10, #63
+; CHECK-NEXT: adds x8, x8, x10
+; CHECK-NEXT: asr x10, x1, #63
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: adc x11, x11, x12
+; CHECK-NEXT: adds x8, x15, x8
+; CHECK-NEXT: adc x9, x9, x11
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x10, #0, eq
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: stp x0, x1, [x4]
+; CHECK-NEXT: cbz w8, .LBB1_3
+; CHECK-NEXT: .LBB1_2: // %if.then
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl error
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x0, w0
+; CHECK-NEXT: asr x1, x0, #63
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB1_3: // %cleanup
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_4: // %overflow.no
+; CHECK-NEXT: smulh x1, x0, x2
+; CHECK-NEXT: mul x0, x0, x2
+; CHECK-NEXT: stp x0, x1, [x4]
+; CHECK-NEXT: cbnz w8, .LBB1_2
+; CHECK-NEXT: b .LBB1_3
+entry:
+ %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
+ %1 = extractvalue { i128, i1 } %0, 0
+ store i128 %1, ptr %out
+ %2 = extractvalue { i128, i1 } %0, 1
+ br i1 %2, label %if.then, label %cleanup
+
+if.then:
+ %call = tail call i32 @error()
+ %conv1 = sext i32 %call to i128
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i128 [ %conv1, %if.then ], [ %1, %entry ]
+ ret i128 %retval.0
+}
+
+define i128 @test3(i128 noundef %x, i128 noundef %y, ptr %out) {
+; CHECK-LABEL: test3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB2_3
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: mul x8, x3, x0
+; CHECK-NEXT: cmp x1, #0
+; CHECK-NEXT: ccmp x3, #0, #4, ne
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x9, x3, x0
+; CHECK-NEXT: madd x11, x1, x2, x8
+; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x12, x0, x2
+; CHECK-NEXT: ccmp xzr, x9, #0, eq
+; CHECK-NEXT: mul x8, x0, x2
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: adds x9, x12, x11
+; CHECK-NEXT: csinc w10, w10, wzr, lo
+; CHECK-NEXT: stp x8, x9, [x4]
+; CHECK-NEXT: cbnz w10, .LBB2_4
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: mov w0, #1 // =0x1
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB2_3: // %overflow.no
+; CHECK-NEXT: umulh x9, x0, x2
+; CHECK-NEXT: mov w10, wzr
+; CHECK-NEXT: mul x8, x0, x2
+; CHECK-NEXT: stp x8, x9, [x4]
+; CHECK-NEXT: cbz w10, .LBB2_2
+; CHECK-NEXT: .LBB2_4: // %if.then
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl error
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x0, w0
+; CHECK-NEXT: asr x1, x0, #63
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
+ %1 = extractvalue { i128, i1 } %0, 0
+ store i128 %1, ptr %out
+ %2 = extractvalue { i128, i1 } %0, 1
+ br i1 %2, label %if.then, label %cleanup
+
+if.then:
+ %call = tail call i32 @error()
+ %conv1 = sext i32 %call to i128
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i128 [ %conv1, %if.then ], [ 1, %entry ]
+ ret i128 %retval.0
+}
+
+define i128 @test4(i128 noundef %x, i128 noundef %y, i128 %out) {
+; CHECK-LABEL: test4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: orr x8, x1, x3
+; CHECK-NEXT: cbz x8, .LBB3_2
+; CHECK-NEXT: // %bb.1: // %overflow
+; CHECK-NEXT: mul x8, x3, x0
+; CHECK-NEXT: cmp x1, #0
+; CHECK-NEXT: ccmp x3, #0, #4, ne
+; CHECK-NEXT: umulh x10, x1, x2
+; CHECK-NEXT: umulh x9, x3, x0
+; CHECK-NEXT: madd x11, x1, x2, x8
+; CHECK-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-NEXT: umulh x12, x0, x2
+; CHECK-NEXT: ccmp xzr, x9, #0, eq
+; CHECK-NEXT: mul x8, x0, x2
+; CHECK-NEXT: cset w10, ne
+; CHECK-NEXT: adds x9, x12, x11
+; CHECK-NEXT: csinc w10, w10, wzr, lo
+; CHECK-NEXT: b .LBB3_3
+; CHECK-NEXT: .LBB3_2: // %overflow.no
+; CHECK-NEXT: umulh x9, x0, x2
+; CHECK-NEXT: mov w10, wzr
+; CHECK-NEXT: mul x8, x0, x2
+; CHECK-NEXT: .LBB3_3: // %overflow.res
+; CHECK-NEXT: adds x0, x8, x4
+; CHECK-NEXT: adc x1, x9, x5
+; CHECK-NEXT: cbz w10, .LBB3_5
+; CHECK-NEXT: // %bb.4: // %if.then
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl error
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x0, w0
+; CHECK-NEXT: asr x1, x0, #63
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB3_5: // %cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
+ %1 = extractvalue { i128, i1 } %0, 0
+ %res = add i128 %1, %out
+ %2 = extractvalue { i128, i1 } %0, 1
+ br i1 %2, label %if.then, label %cleanup
+
+if.then:
+ %call = tail call i32 @error()
+ %conv1 = sext i32 %call to i128
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi i128 [ %conv1, %if.then ], [ %res, %entry ]
+ ret i128 %retval.0
+}
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index edfd80b4f2706..ace0c83e63c7c 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -4,20 +4,28 @@
define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; AARCH-LABEL: muloti_test:
; AARCH: // %bb.0: // %start
+; AARCH-NEXT: orr x8, x1, x3
+; AARCH-NEXT: cbz x8, .LBB0_2
+; AARCH-NEXT: // %bb.1: // %overflow
; AARCH-NEXT: mul x9, x3, x0
; AARCH-NEXT: cmp x1, #0
; AARCH-NEXT: ccmp x3, #0, #4, ne
-; AARCH-NEXT: umulh x8, x1, x2
-; AARCH-NEXT: umulh x10, x3, x0
+; AARCH-NEXT: umulh x10, x1, x2
+; AARCH-NEXT: umulh x8, x3, x0
; AARCH-NEXT: madd x9, x1, x2, x9
-; AARCH-NEXT: ccmp xzr, x8, #0, eq
-; AARCH-NEXT: umulh x11, x0, x2
; AARCH-NEXT: ccmp xzr, x10, #0, eq
+; AARCH-NEXT: umulh x11, x0, x2
+; AARCH-NEXT: ccmp xzr, x8, #0, eq
; AARCH-NEXT: mul x0, x0, x2
; AARCH-NEXT: cset w8, ne
; AARCH-NEXT: adds x1, x11, x9
; AARCH-NEXT: csinc w2, w8, wzr, lo
; AARCH-NEXT: ret
+; AARCH-NEXT: .LBB0_2: // %overflow.no
+; AARCH-NEXT: umulh x1, x0, x2
+; AARCH-NEXT: mul x0, x0, x2
+; AARCH-NEXT: mov w2, wzr
+; AARCH-NEXT: ret
start:
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
%1 = extractvalue { i128, i1 } %0, 0
@@ -35,45 +43,56 @@ start:
define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
; AARCH-LABEL: __muloti4:
; AARCH: // %bb.0: // %Entry
-; AARCH-NEXT: asr x11, x1, #63
-; AARCH-NEXT: asr x9, x3, #63
-; AARCH-NEXT: umulh x12, x0, x2
-; AARCH-NEXT: mov x8, x1
+; AARCH-NEXT: eor x8, x3, x2, asr #63
+; AARCH-NEXT: eor x9, x1, x0, asr #63
; AARCH-NEXT: str wzr, [x4]
-; AARCH-NEXT: mul x13, x1, x2
-; AARCH-NEXT: umulh x10, x1, x2
-; AARCH-NEXT: mul x11, x11, x2
-; AARCH-NEXT: adds x12, x13, x12
-; AARCH-NEXT: mul x15, x0, x3
-; AARCH-NEXT: umulh x14, x0, x3
-; AARCH-NEXT: adc x10, x10, x11
-; AARCH-NEXT: mul x9, x0, x9
-; AARCH-NEXT: mul x16, x1, x3
-; AARCH-NEXT: adds x1, x15, x12
-; AARCH-NEXT: asr x12, x10, #63
-; AARCH-NEXT: smulh x11, x8, x3
-; AARCH-NEXT: adc x9, x14, x9
-; AARCH-NEXT: asr x13, x9, #63
-; AARCH-NEXT: adds x9, x10, x9
-; AARCH-NEXT: asr x10, x1, #63
+; AARCH-NEXT: orr x8, x9, x8
+; AARCH-NEXT: cbz x8, .LBB1_2
+; AARCH-NEXT: // %bb.1: // %overflow
+; AARCH-NEXT: asr x9, x1, #63
+; AARCH-NEXT: umulh x10, x0, x2
+; AARCH-NEXT: asr x13, x3, #63
+; AARCH-NEXT: mul x11, x1, x2
+; AARCH-NEXT: umulh x8, x1, x2
+; AARCH-NEXT: mul x9, x9, x2
+; AARCH-NEXT: adds x10, x11, x10
+; AARCH-NEXT: mul x14, x0, x3
+; AARCH-NEXT: umulh x12, x0, x3
+; AARCH-NEXT: adc x9, x8, x9
+; AARCH-NEXT: mul x13, x0, x13
+; AARCH-NEXT: adds x8, x14, x10
+; AARCH-NEXT: mul x15, x1, x3
+; AARCH-NEXT: smulh x10, x1, x3
+; AARCH-NEXT: adc x11, x12, x13
+; AARCH-NEXT: asr x12, x9, #63
+; AARCH-NEXT: asr x13, x11, #63
+; AARCH-NEXT: adds x9, x9, x11
+; AARCH-NEXT: asr x11, x8, #63
; AARCH-NEXT: mul x0, x0, x2
; AARCH-NEXT: adc x12, x12, x13
-; AARCH-NEXT: adds x9, x16, x9
-; AARCH-NEXT: adc x11, x11, x12
-; AARCH-NEXT: cmp x9, x10
-; AARCH-NEXT: ccmp x11, x10, #0, eq
+; AARCH-NEXT: adds x9, x15, x9
+; AARCH-NEXT: adc x10, x10, x12
+; AARCH-NEXT: cmp x9, x11
+; AARCH-NEXT: ccmp x10, x11, #0, eq
; AARCH-NEXT: cset w9, ne
-; AARCH-NEXT: tbz x8, #63, .LBB1_2
-; AARCH-NEXT: // %bb.1: // %Entry
-; AARCH-NEXT: eor x8, x3, #0x8000000000000000
-; AARCH-NEXT: orr x8, x2, x8
-; AARCH-NEXT: cbz x8, .LBB1_3
-; AARCH-NEXT: .LBB1_2: // %Else2
-; AARCH-NEXT: cbz w9, .LBB1_4
-; AARCH-NEXT: .LBB1_3: // %Then7
-; AARCH-NEXT: mov w8, #1 // =0x1
-; AARCH-NEXT: str w8, [x4]
-; AARCH-NEXT: .LBB1_4: // %Block9
+; AARCH-NEXT: tbnz x1, #63, .LBB1_3
+; AARCH-NEXT: b .LBB1_4
+; AARCH-NEXT: .LBB1_2: // %overflow.no
+; AARCH-NEXT: smulh x8, x0, x2
+; AARCH-NEXT: mov w9, wzr
+; AARCH-NEXT: mul x0, x0, x2
+; AARCH-NEXT: tbz x1, #63, .LBB1_4
+; AARCH-NEXT: .LBB1_3: // %overflow.res
+; AARCH-NEXT: eor x10, x3, #0x8000000000000000
+; AARCH-NEXT: orr x10, x2, x10
+; AARCH-NEXT: cbz x10, .LBB1_5
+; AARCH-NEXT: .LBB1_4: // %Else2
+; AARCH-NEXT: cbz w9, .LBB1_6
+; AARCH-NEXT: .LBB1_5: // %Then7
+; AARCH-NEXT: mov w9, #1 // =0x1
+; AARCH-NEXT: str w9, [x4]
+; AARCH-NEXT: .LBB1_6: // %Block9
+; AARCH-NEXT: mov x1, x8
; AARCH-NEXT: ret
Entry:
store i32 0, ptr %2, align 4