[llvm] [AArch64][GISel] Add legalizer support for @llvm.umul.with.overflow.i128 (PR #170101)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 19:32:46 PST 2025
https://github.com/ayank227 updated https://github.com/llvm/llvm-project/pull/170101
>From 345c2041f7424e755d555e80cfef79c3494bf211 Mon Sep 17 00:00:00 2001
From: Ayan Kundu <ayank at nvidia.com>
Date: Fri, 28 Nov 2025 08:47:54 +0000
Subject: [PATCH] [AArch64][GISel] Add legalizer support for
@llvm.umul.with.overflow.i128
This follows the same pattern SelectionDAG uses to handle this intrinsic.
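
For reference, a standalone sketch (not part of the patch) of the narrowing
scheme that narrowScalarMULO follows for s128 -> s64. It assumes a host
compiler with unsigned __int128 for the 64x64 -> 128 partial products; the
names (U128, MulHi, umulo128) are illustrative only, not APIs from the tree:

#include <cstdint>
#include <utility>

struct U128 { uint64_t Hi, Lo; };

// High 64 bits of a 64x64 -> 128 unsigned multiply (stands in for G_UMULH).
static uint64_t MulHi(uint64_t X, uint64_t Y) {
  return (uint64_t)(((unsigned __int128)X * Y) >> 64);
}

// Returns {truncated 128-bit product, overflow flag}, using only 64-bit
// multiplies, umulh and one carry check, mirroring the G_UMULO expansion.
static std::pair<U128, bool> umulo128(U128 A, U128 B) {
  // If both high halves are non-zero, the product cannot fit in 128 bits.
  bool BothHiNonZero = A.Hi != 0 && B.Hi != 0;

  // Cross products; a non-zero high half of either one means overflow.
  uint64_t Mid1 = A.Hi * B.Lo;
  bool Ovf1 = MulHi(A.Hi, B.Lo) != 0;
  uint64_t Mid2 = A.Lo * B.Hi;
  bool Ovf2 = MulHi(A.Lo, B.Hi) != 0;
  uint64_t HighSum = Mid1 + Mid2;

  // Full 128-bit product of the low halves.
  uint64_t ResLo = A.Lo * B.Lo;
  uint64_t ResHi = MulHi(A.Lo, B.Lo);

  // Fold the cross products into the high half; a carry out is overflow.
  uint64_t FinalHi = ResHi + HighSum;
  bool Ovf3 = FinalHi < ResHi;

  return {{FinalHi, ResLo}, BothHiNonZero || Ovf1 || Ovf2 || Ovf3};
}

Note that the unchecked Mid1 + Mid2 addition can only wrap when both high
halves are non-zero, and that case is already reported via BothHiNonZero.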
---
.../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 27 +-
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 327 +++++++++++-------
.../AArch64/GlobalISel/arm64-fallback.ll | 13 -
.../CodeGen/AArch64/i128_with_overflow.ll | 119 +++++--
4 files changed, 305 insertions(+), 181 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index a458cbd94ccb1..56f16f9bfe96a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -179,14 +179,14 @@ class LegalizerHelper {
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx);
private:
- LegalizeResult
- widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy);
- LegalizeResult
- widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy);
- LegalizeResult
- widenScalarExtract(MachineInstr &MI, unsigned TypeIdx, LLT WideTy);
- LegalizeResult
- widenScalarInsert(MachineInstr &MI, unsigned TypeIdx, LLT WideTy);
+ LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy);
+ LegalizeResult widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy);
+ LegalizeResult widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy);
+ LegalizeResult widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy);
LegalizeResult widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
LLT WideTy);
LegalizeResult widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
@@ -203,9 +203,9 @@ class LegalizerHelper {
///
/// If \p ResultTy does not evenly break into \p PartTy sized pieces, the
/// remainder must be specified with \p LeftoverRegs of type \p LeftoverTy.
- void insertParts(Register DstReg, LLT ResultTy,
- LLT PartTy, ArrayRef<Register> PartRegs,
- LLT LeftoverTy = LLT(), ArrayRef<Register> LeftoverRegs = {});
+ void insertParts(Register DstReg, LLT ResultTy, LLT PartTy,
+ ArrayRef<Register> PartRegs, LLT LeftoverTy = LLT(),
+ ArrayRef<Register> LeftoverRegs = {});
/// Merge \p PartRegs with different types into \p DstReg.
void mergeMixedSubvectors(Register DstReg, ArrayRef<Register> PartRegs);
@@ -216,8 +216,8 @@ class LegalizerHelper {
/// Parts. The elements of \p Parts will be the greatest common divisor type
/// of \p DstTy, \p NarrowTy and the type of \p SrcReg. This will compute and
/// return the GCD type.
- LLT extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
- LLT NarrowTy, Register SrcReg);
+ LLT extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy, LLT NarrowTy,
+ Register SrcReg);
/// Unmerge \p SrcReg into \p GCDTy typed registers. This will append all of
/// the unpacked registers to \p Parts. This version is if the common unmerge
@@ -426,6 +426,7 @@ class LegalizerHelper {
LLVM_ABI LegalizeResult narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
LLT NarrowTy);
LLVM_ABI LegalizeResult narrowScalarMul(MachineInstr &MI, LLT Ty);
+ LLVM_ABI LegalizeResult narrowScalarMULO(MachineInstr &MI, LLT Ty);
LLVM_ABI LegalizeResult narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
LLT Ty);
LLVM_ABI LegalizeResult narrowScalarExtract(MachineInstr &MI,
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 1aa1d465d8da6..9c1e063322963 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -53,8 +53,8 @@ using namespace MIPatternMatch;
///
/// Returns -1 in the first element of the pair if the breakdown is not
/// satisfiable.
-static std::pair<int, int>
-getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
+static std::pair<int, int> getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy,
+ LLT &LeftoverTy) {
assert(!LeftoverTy.isValid() && "this is an out argument");
unsigned Size = OrigTy.getSizeInBits();
@@ -160,10 +160,8 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
}
}
-void LegalizerHelper::insertParts(Register DstReg,
- LLT ResultTy, LLT PartTy,
- ArrayRef<Register> PartRegs,
- LLT LeftoverTy,
+void LegalizerHelper::insertParts(Register DstReg, LLT ResultTy, LLT PartTy,
+ ArrayRef<Register> PartRegs, LLT LeftoverTy,
ArrayRef<Register> LeftoverRegs) {
if (!LeftoverTy.isValid()) {
assert(LeftoverRegs.empty());
@@ -278,7 +276,7 @@ LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
// Shift the sign bit of the low register through the high register.
auto ShiftAmt =
- MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
+ MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
}
}
@@ -847,7 +845,8 @@ static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
if (MemType.isVector())
return RTLIB::UNKNOWN_LIBCALL;
-#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
+#define LCALLS(A, B) \
+ { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
#define LCALL5(A) \
LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
switch (Opc) {
@@ -1002,8 +1001,8 @@ conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
&MI);
}
-static RTLIB::Libcall
-getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
+static RTLIB::Libcall getStateLibraryFunctionFor(MachineInstr &MI,
+ const TargetLowering &TLI) {
RTLIB::Libcall RTLibcall;
switch (MI.getOpcode()) {
case TargetOpcode::G_GET_FPENV:
@@ -1440,8 +1439,10 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
}
case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
- Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
- Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
+ Type *FromTy =
+ getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
+ Type *ToTy =
+ getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
if (!FromTy || !ToTy)
return UnableToLegalize;
LegalizeResult Status =
@@ -1603,13 +1604,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
if (LeftoverBits != 0) {
LeftoverTy = LLT::scalar(LeftoverBits);
auto K = MIRBuilder.buildConstant(
- LeftoverTy,
- Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
+ LeftoverTy, Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
LeftoverRegs.push_back(K.getReg(0));
}
- insertParts(MI.getOperand(0).getReg(),
- Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
+ insertParts(MI.getOperand(0).getReg(), Ty, NarrowTy, PartRegs, LeftoverTy,
+ LeftoverRegs);
MI.eraseFromParent();
return Legalized;
@@ -1669,6 +1669,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
case TargetOpcode::G_MUL:
case TargetOpcode::G_UMULH:
return narrowScalarMul(MI, NarrowTy);
+ case TargetOpcode::G_UMULO:
+ return narrowScalarMULO(MI, NarrowTy);
case TargetOpcode::G_EXTRACT:
return narrowScalarExtract(MI, TypeIdx, NarrowTy);
case TargetOpcode::G_INSERT:
@@ -2230,8 +2232,9 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
- Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
- MRI.createGenericVirtualRegister(WideTy);
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy
+ ? DstReg
+ : MRI.createGenericVirtualRegister(WideTy);
auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
@@ -2475,8 +2478,7 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
if (Offset == 0) {
// Avoid a shift in the degenerate case.
- MIRBuilder.buildTrunc(DstReg,
- MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
+ MIRBuilder.buildTrunc(DstReg, MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
MI.eraseFromParent();
return Legalized;
}
@@ -2488,8 +2490,8 @@ LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
ShiftTy = WideTy;
}
- auto LShr = MIRBuilder.buildLShr(
- ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
+ auto LShr = MIRBuilder.buildLShr(ShiftTy, Src,
+ MIRBuilder.buildConstant(ShiftTy, Offset));
MIRBuilder.buildTrunc(DstReg, LShr);
MI.eraseFromParent();
return Legalized;
@@ -2827,8 +2829,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
// the top of the original type.
auto TopBit =
APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
- MIBSrc = MIRBuilder.buildOr(
- WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
+ MIBSrc = MIRBuilder.buildOr(WideTy, MIBSrc,
+ MIRBuilder.buildConstant(WideTy, TopBit));
// Now we know the operand is non-zero, use the more relaxed opcode.
NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
}
@@ -3177,8 +3179,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changingInstr(MI);
- unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
- TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
+ unsigned ExtType = Ty.getScalarSizeInBits() == 1 ? TargetOpcode::G_ZEXT
+ : TargetOpcode::G_ANYEXT;
widenScalarSrc(MI, WideTy, 0, ExtType);
Observer.changedInstr(MI);
@@ -3714,8 +3716,9 @@ static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
auto OffsetMask = B.buildConstant(
IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
- return B.buildShl(IdxTy, OffsetIdx,
- B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
+ return B
+ .buildShl(IdxTy, OffsetIdx, B.buildConstant(IdxTy, Log2_32(OldEltSize)))
+ .getReg(0);
}
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
@@ -3768,7 +3771,8 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
- auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
+ auto Elt =
+ MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
NewOps[I] = Elt.getReg(0);
}
@@ -3809,13 +3813,14 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
Register WideElt = CastVec;
if (CastTy.isVector()) {
- WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
- ScaledIdx).getReg(0);
+ WideElt =
+ MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, ScaledIdx)
+ .getReg(0);
}
// Compute the bit offset into the register of the target element.
Register OffsetBits = getBitcastWiderVectorElementOffset(
- MIRBuilder, Idx, NewEltSize, OldEltSize);
+ MIRBuilder, Idx, NewEltSize, OldEltSize);
// Shift the wide element to get the target element.
auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
@@ -3831,18 +3836,17 @@ LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
/// TargetReg, while preserving other bits in \p TargetReg.
///
/// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
-static Register buildBitFieldInsert(MachineIRBuilder &B,
- Register TargetReg, Register InsertReg,
- Register OffsetBits) {
+static Register buildBitFieldInsert(MachineIRBuilder &B, Register TargetReg,
+ Register InsertReg, Register OffsetBits) {
LLT TargetTy = B.getMRI()->getType(TargetReg);
LLT InsertTy = B.getMRI()->getType(InsertReg);
auto ZextVal = B.buildZExt(TargetTy, InsertReg);
auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
// Produce a bitmask of the value to insert
- auto EltMask = B.buildConstant(
- TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
- InsertTy.getSizeInBits()));
+ auto EltMask =
+ B.buildConstant(TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
+ InsertTy.getSizeInBits()));
// Shift it into position
auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
@@ -3897,19 +3901,22 @@ LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
Register ExtractedElt = CastVec;
if (CastTy.isVector()) {
- ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
- ScaledIdx).getReg(0);
+ ExtractedElt =
+ MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, ScaledIdx)
+ .getReg(0);
}
// Compute the bit offset into the register of the target element.
Register OffsetBits = getBitcastWiderVectorElementOffset(
- MIRBuilder, Idx, NewEltSize, OldEltSize);
+ MIRBuilder, Idx, NewEltSize, OldEltSize);
- Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
- Val, OffsetBits);
+ Register InsertedElt =
+ buildBitFieldInsert(MIRBuilder, ExtractedElt, Val, OffsetBits);
if (CastTy.isVector()) {
- InsertedElt = MIRBuilder.buildInsertVectorElement(
- CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
+ InsertedElt =
+ MIRBuilder
+ .buildInsertVectorElement(CastTy, CastVec, InsertedElt, ScaledIdx)
+ .getReg(0);
}
MIRBuilder.buildBitcast(Dst, InsertedElt);
@@ -4365,14 +4372,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
// Generate the PtrAdd and truncating stores.
LLT PtrTy = MRI.getType(PtrReg);
- auto OffsetCst = MIRBuilder.buildConstant(
- LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
+ auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
+ LargeSplitSize / 8);
auto SmallPtr = MIRBuilder.buildObjectPtrOffset(PtrTy, PtrReg, OffsetCst);
MachineMemOperand *LargeMMO =
- MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+ MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
MachineMemOperand *SmallMMO =
- MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+ MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
StoreMI.eraseFromParent();
@@ -4506,16 +4513,16 @@ LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
// Legalize an instruction by changing the opcode in place.
void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
- Observer.changingInstr(MI);
- MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
- Observer.changedInstr(MI);
+ Observer.changingInstr(MI);
+ MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
+ Observer.changedInstr(MI);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
using namespace TargetOpcode;
- switch(MI.getOpcode()) {
+ switch (MI.getOpcode()) {
default:
return UnableToLegalize;
case TargetOpcode::G_FCONSTANT:
@@ -4783,7 +4790,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
LLT DstTy = MRI.getType(DstReg);
Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
- auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
+ auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() -
+ SizeInBits);
MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
MI.eraseFromParent();
@@ -4876,7 +4884,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
return lowerEXT(MI);
case G_TRUNC:
return lowerTRUNC(MI);
- GISEL_VECREDUCE_CASES_NONSEQ
+ GISEL_VECREDUCE_CASES_NONSEQ
return lowerVectorReduction(MI);
case G_VAARG:
return lowerVAArg(MI);
@@ -5439,7 +5447,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
LLT LeftoverTy;
SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
if (IsLoad) {
- std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
+ std::tie(NumParts, NumLeftover) =
+ getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
} else {
if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
NarrowLeftoverRegs, MIRBuilder, MRI)) {
@@ -5499,8 +5508,8 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
if (IsLoad) {
- insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
- LeftoverTy, NarrowLeftoverRegs);
+ insertParts(ValReg, ValTy, NarrowTy, NarrowRegs, LeftoverTy,
+ NarrowLeftoverRegs);
}
LdStMI.eraseFromParent();
@@ -5667,7 +5676,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
case G_SEXT_INREG:
return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
- GISEL_VECREDUCE_CASES_NONSEQ
+ GISEL_VECREDUCE_CASES_NONSEQ
return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
case TargetOpcode::G_VECREDUCE_SEQ_FADD:
case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
@@ -5991,7 +6000,7 @@ LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
// one NarrowTy size value left.
while (SplitSrcs.size() > 1) {
SmallVector<Register> PartialRdxs;
- for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
+ for (unsigned Idx = 0; Idx < SplitSrcs.size() - 1; Idx += 2) {
Register LHS = SplitSrcs[Idx];
Register RHS = SplitSrcs[Idx + 1];
// Create the intermediate vector op.
@@ -6008,9 +6017,8 @@ LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
return Legalized;
}
-LegalizerHelper::LegalizeResult
-LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
- const LLT HalfTy, const LLT AmtTy) {
+LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalarShiftByConstant(
+ MachineInstr &MI, const APInt &Amt, const LLT HalfTy, const LLT AmtTy) {
Register InL = MRI.createGenericVirtualRegister(HalfTy);
Register InH = MRI.createGenericVirtualRegister(HalfTy);
@@ -6200,13 +6208,13 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
// Long: ShAmt >= NewBitSize
MachineInstrBuilder HiL;
if (MI.getOpcode() == TargetOpcode::G_LSHR) {
- HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
+ HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
} else {
auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
- HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
+ HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
}
auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
- {InH, AmtExcess}); // Lo from Hi part.
+ {InH, AmtExcess}); // Lo from Hi part.
auto Lo = MIRBuilder.buildSelect(
HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
@@ -7202,6 +7210,92 @@ LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
return Legalized;
}
+// Narrow unsigned multiplication with overflow (G_UMULO).
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarMULO(MachineInstr &MI, LLT NarrowTy) {
+ auto [DstReg, OverflowReg, Src1, Src2] = MI.getFirst4Regs();
+
+ LLT Ty = MRI.getType(DstReg);
+ if (Ty.isVector())
+ return UnableToLegalize;
+
+ unsigned Size = Ty.getSizeInBits();
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ if (Size % NarrowSize != 0)
+ return UnableToLegalize;
+
+ unsigned NumParts = Size / NarrowSize;
+ if (NumParts != 2)
+ return UnableToLegalize; // Only handle i128→i64 narrowing
+
+ // Split inputs into high/low parts
+ SmallVector<Register, 2> Src1Parts, Src2Parts;
+ extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
+ extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
+
+ Register LHSLo = Src1Parts[0];
+ Register LHSHi = Src1Parts[1];
+ Register RHSLo = Src2Parts[0];
+ Register RHSHi = Src2Parts[1];
+
+ // Check if both high parts are non-zero → guaranteed overflow
+ auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
+ auto LHSHiNZ =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), LHSHi, Zero);
+ auto RHSHiNZ =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), RHSHi, Zero);
+ auto BothHiNonZero = MIRBuilder.buildAnd(LLT::scalar(1), LHSHiNZ, RHSHiNZ);
+
+ // Cross multiply LHSHi × RHSLo with overflow (use MUL+UMULH directly)
+ auto Mid1 = MIRBuilder.buildMul(NarrowTy, LHSHi, RHSLo);
+ auto Mid1Hi = MIRBuilder.buildUMulH(NarrowTy, LHSHi, RHSLo);
+ auto Ovf1 =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Mid1Hi, Zero);
+
+ // Cross multiply LHSLo × RHSHi with overflow (use MUL+UMULH directly)
+ auto Mid2 = MIRBuilder.buildMul(NarrowTy, LHSLo, RHSHi);
+ auto Mid2Hi = MIRBuilder.buildUMulH(NarrowTy, LHSLo, RHSHi);
+ auto Ovf2 =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Mid2Hi, Zero);
+
+ // Add the cross products (HighSum = Mid1 + Mid2)
+ auto HighSum = MIRBuilder.buildAdd(NarrowTy, Mid1, Mid2);
+
+ // Multiply low parts to get full 128-bit result (using ZEXT pattern)
+ LLT WideTy = LLT::scalar(Size);
+ auto LHSLoExt = MIRBuilder.buildZExt(WideTy, LHSLo);
+ auto RHSLoExt = MIRBuilder.buildZExt(WideTy, RHSLo);
+ auto FullMul = MIRBuilder.buildMul(WideTy, LHSLoExt, RHSLoExt).getReg(0);
+
+ SmallVector<Register, 2> LowMulParts;
+ extractParts(FullMul, NarrowTy, NumParts, LowMulParts, MIRBuilder, MRI);
+ Register ResLo = LowMulParts[0];
+ Register ResHi = LowMulParts[1];
+
+ // Add HighSum to ResHi with overflow detection
+ auto AddHighSum =
+ MIRBuilder.buildUAddo(NarrowTy, LLT::scalar(1), ResHi, HighSum);
+ Register FinalHi = AddHighSum.getReg(0);
+ Register Ovf3 = AddHighSum.getReg(1);
+
+ // Combine all overflow flags
+ // overflow = BothHiNonZero || Ovf1 || Ovf2 || Ovf3
+ auto Ovf12 = MIRBuilder.buildOr(LLT::scalar(1), Ovf1, Ovf2);
+ auto Ovf123 = MIRBuilder.buildOr(LLT::scalar(1), Ovf12, Ovf3);
+ auto FinalOvf = MIRBuilder.buildOr(LLT::scalar(1), BothHiNonZero, Ovf123);
+
+ // Build final result
+ // Emit G_MERGE_VALUES for the result
+ SmallVector<Register, 2> ResultParts = {ResLo, FinalHi};
+ MIRBuilder.buildMergeLikeInstr(DstReg, ResultParts);
+
+ // Normalize overflow to s1 type
+ MIRBuilder.buildCopy(OverflowReg, FinalOvf);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
LLT NarrowTy) {
@@ -7348,7 +7442,7 @@ LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
InsertOffset = OpStart - DstStart;
ExtractOffset = 0;
SegSize =
- std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
+ std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
}
Register SegReg = OpReg;
@@ -7399,19 +7493,18 @@ LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
- {Src0Regs[I], Src1Regs[I]});
+ {Src0Regs[I], Src1Regs[I]});
DstRegs.push_back(Inst.getReg(0));
}
for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
- auto Inst = MIRBuilder.buildInstr(
- MI.getOpcode(),
- {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
+ auto Inst =
+ MIRBuilder.buildInstr(MI.getOpcode(), {LeftoverTy},
+ {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
DstLeftoverRegs.push_back(Inst.getReg(0));
}
- insertParts(DstReg, DstTy, NarrowTy, DstRegs,
- LeftoverTy, DstLeftoverRegs);
+ insertParts(DstReg, DstTy, NarrowTy, DstRegs, LeftoverTy, DstLeftoverRegs);
MI.eraseFromParent();
return Legalized;
@@ -7431,7 +7524,8 @@ LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
SmallVector<Register, 8> Parts;
LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
- LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
+ LLT LCMTy =
+ buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
MI.eraseFromParent();
@@ -7466,19 +7560,18 @@ LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
llvm_unreachable("inconsistent extractParts result");
for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
- auto Select = MIRBuilder.buildSelect(NarrowTy,
- CondReg, Src1Regs[I], Src2Regs[I]);
+ auto Select =
+ MIRBuilder.buildSelect(NarrowTy, CondReg, Src1Regs[I], Src2Regs[I]);
DstRegs.push_back(Select.getReg(0));
}
for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
auto Select = MIRBuilder.buildSelect(
- LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
+ LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
DstLeftoverRegs.push_back(Select.getReg(0));
}
- insertParts(DstReg, DstTy, NarrowTy, DstRegs,
- LeftoverTy, DstLeftoverRegs);
+ insertParts(DstReg, DstTy, NarrowTy, DstRegs, LeftoverTy, DstLeftoverRegs);
MI.eraseFromParent();
return Legalized;
@@ -7502,9 +7595,8 @@ LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
auto C_0 = B.buildConstant(NarrowTy, 0);
auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
UnmergeSrc.getReg(1), C_0);
- auto LoCTLZ = IsUndef ?
- B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
- B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
+ auto LoCTLZ = IsUndef ? B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0))
+ : B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
@@ -7535,9 +7627,8 @@ LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
auto C_0 = B.buildConstant(NarrowTy, 0);
auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
UnmergeSrc.getReg(0), C_0);
- auto HiCTTZ = IsUndef ?
- B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
- B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
+ auto HiCTTZ = IsUndef ? B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1))
+ : B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
@@ -7745,7 +7836,7 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
- assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
+ assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
// 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
// bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
@@ -8366,8 +8457,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
- auto SignMask = MIRBuilder.buildConstant(SrcTy,
- APInt::getSignMask(SrcEltBits));
+ auto SignMask =
+ MIRBuilder.buildConstant(SrcTy, APInt::getSignMask(SrcEltBits));
auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
@@ -8389,8 +8480,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
const LLT S1 = LLT::scalar(1);
- auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
- S1, Exponent, ExponentLoBit);
+ auto CmpGt =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, Exponent, ExponentLoBit);
R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
@@ -8399,8 +8490,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
- auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
- S1, Exponent, ZeroSrcTy);
+ auto ExponentLt0 =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, Exponent, ZeroSrcTy);
auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
@@ -8544,13 +8635,13 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
// Subtract the fp64 exponent bias (1023) to get the real exponent and
// add the f16 bias (15) to get the biased exponent for the f16 format.
E = MIRBuilder.buildAdd(
- S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
+ S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
- auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
- MIRBuilder.buildConstant(S32, 0x1ff));
+ auto MaskedSig =
+ MIRBuilder.buildAnd(S32, UH, MIRBuilder.buildConstant(S32, 0x1ff));
MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
auto Zero = MIRBuilder.buildConstant(S32, 0);
@@ -8576,14 +8667,14 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
- auto SigSetHigh = MIRBuilder.buildOr(S32, M,
- MIRBuilder.buildConstant(S32, 0x1000));
+ auto SigSetHigh =
+ MIRBuilder.buildOr(S32, M, MIRBuilder.buildConstant(S32, 0x1000));
auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
auto D0 = MIRBuilder.buildShl(S32, D, B);
- auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
- D0, SigSetHigh);
+ auto D0_NE_SigSetHigh =
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, D0, SigSetHigh);
auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
D = MIRBuilder.buildOr(S32, D, D1);
@@ -8604,13 +8695,13 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
V1 = MIRBuilder.buildOr(S32, V0, V1);
V = MIRBuilder.buildAdd(S32, V, V1);
- auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
- E, MIRBuilder.buildConstant(S32, 30));
+ auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, E,
+ MIRBuilder.buildConstant(S32, 30));
V = MIRBuilder.buildSelect(S32, CmpEGt30,
MIRBuilder.buildConstant(S32, 0x7c00), V);
- auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
- E, MIRBuilder.buildConstant(S32, 1039));
+ auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, E,
+ MIRBuilder.buildConstant(S32, 1039));
V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
// Extract the sign bit.
@@ -8729,11 +8820,11 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
const int Src0Size = Src0Ty.getScalarSizeInBits();
const int Src1Size = Src1Ty.getScalarSizeInBits();
- auto SignBitMask = MIRBuilder.buildConstant(
- Src0Ty, APInt::getSignMask(Src0Size));
+ auto SignBitMask =
+ MIRBuilder.buildConstant(Src0Ty, APInt::getSignMask(Src0Size));
auto NotSignBitMask = MIRBuilder.buildConstant(
- Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
+ Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
Register And1;
@@ -8890,8 +8981,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
LLT Ty = MRI.getType(DstReg);
unsigned Flags = MI.getFlags();
- auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
- Flags);
+ auto Mul =
+ MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2), Flags);
MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
MI.eraseFromParent();
return Legalized;
@@ -8944,10 +9035,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
- auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
- SrcReg, Zero, Flags);
- auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
- SrcReg, Trunc, Flags);
+ auto Lt0 =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy, SrcReg, Zero, Flags);
+ auto NeTrunc =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy, SrcReg, Trunc, Flags);
auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
@@ -8971,8 +9062,9 @@ LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
Register SrcReg = MI.getOperand(I).getReg();
auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
- Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
- MRI.createGenericVirtualRegister(WideTy);
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy
+ ? DstReg
+ : MRI.createGenericVirtualRegister(WideTy);
auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
@@ -8982,7 +9074,7 @@ LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
if (DstTy.isPointer()) {
if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
- DstTy.getAddressSpace())) {
+ DstTy.getAddressSpace())) {
LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
return UnableToLegalize;
}
@@ -9669,8 +9761,7 @@ LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
return Legalized;
}
-LegalizerHelper::LegalizeResult
-LegalizerHelper::lowerShlSat(MachineInstr &MI) {
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerShlSat(MachineInstr &MI) {
assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
"Expected shlsat opcode!");
@@ -9825,7 +9916,7 @@ LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
Register ValReg = MI.getOperand(ValRegIndex).getReg();
const LLT Ty = MRI.getType(ValReg);
const MDString *RegStr = cast<MDString>(
- cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
+ cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
if (!PhysReg) {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index f8cd868a4c755..94469cf262e3e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -101,19 +101,6 @@ entry:
ret void
}
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %4:_(s128), %5:_(s1) = G_UMULO %0:_, %6:_ (in function: umul_s128)
-; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for umul_s128
-; FALLBACK-WITH-REPORT-OUT-LABEL: umul_s128
-declare {i128, i1} @llvm.umul.with.overflow.i128(i128, i128) nounwind readnone
-define zeroext i1 @umul_s128(i128 %v1, ptr %res) {
-entry:
- %t = call {i128, i1} @llvm.umul.with.overflow.i128(i128 %v1, i128 2)
- %val = extractvalue {i128, i1} %t, 0
- %obit = extractvalue {i128, i1} %t, 1
- store i128 %val, ptr %res
- ret i1 %obit
-}
-
; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: {{.*}}llvm.experimental.gc.statepoint{{.*}} (in function: gc_intr)
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for gc_intr
; FALLBACK-WITH-REPORT-OUT-LABEL: gc_intr
diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
index 3d90e094a5747..472ac0dbcacce 100644
--- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
+++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll
@@ -2,8 +2,7 @@
; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-; CHECK-GI: warning: Instruction selection used fallback path for test_umul_i128
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_smul_i128
+; CHECK-GI: warning: Instruction selection used fallback path for test_smul_i128
define i128 @test_uadd_i128(i128 noundef %x, i128 noundef %y) {
; CHECK-SD-LABEL: test_uadd_i128:
@@ -222,41 +221,87 @@ cleanup:
}
define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) {
-; CHECK-LABEL: test_umul_i128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: orr x8, x1, x3
-; CHECK-NEXT: cbz x8, .LBB4_2
-; CHECK-NEXT: // %bb.1: // %overflow
-; CHECK-NEXT: mul x9, x3, x0
-; CHECK-NEXT: cmp x1, #0
-; CHECK-NEXT: ccmp x3, #0, #4, ne
-; CHECK-NEXT: umulh x10, x1, x2
-; CHECK-NEXT: umulh x8, x3, x0
-; CHECK-NEXT: madd x9, x1, x2, x9
-; CHECK-NEXT: ccmp xzr, x10, #0, eq
-; CHECK-NEXT: umulh x11, x0, x2
-; CHECK-NEXT: ccmp xzr, x8, #0, eq
-; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: cset w8, ne
-; CHECK-NEXT: adds x1, x11, x9
-; CHECK-NEXT: csinc w8, w8, wzr, lo
-; CHECK-NEXT: cbnz w8, .LBB4_3
-; CHECK-NEXT: b .LBB4_4
-; CHECK-NEXT: .LBB4_2: // %overflow.no
-; CHECK-NEXT: umulh x1, x0, x2
-; CHECK-NEXT: mul x0, x0, x2
-; CHECK-NEXT: cbz w8, .LBB4_4
-; CHECK-NEXT: .LBB4_3: // %if.then
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl error
-; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-NEXT: sxtw x0, w0
-; CHECK-NEXT: asr x1, x0, #63
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: .LBB4_4: // %cleanup
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_umul_i128:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: orr x8, x1, x3
+; CHECK-SD-NEXT: cbz x8, .LBB4_2
+; CHECK-SD-NEXT: // %bb.1: // %overflow
+; CHECK-SD-NEXT: mul x9, x3, x0
+; CHECK-SD-NEXT: cmp x1, #0
+; CHECK-SD-NEXT: ccmp x3, #0, #4, ne
+; CHECK-SD-NEXT: umulh x10, x1, x2
+; CHECK-SD-NEXT: umulh x8, x3, x0
+; CHECK-SD-NEXT: madd x9, x1, x2, x9
+; CHECK-SD-NEXT: ccmp xzr, x10, #0, eq
+; CHECK-SD-NEXT: umulh x11, x0, x2
+; CHECK-SD-NEXT: ccmp xzr, x8, #0, eq
+; CHECK-SD-NEXT: mul x0, x0, x2
+; CHECK-SD-NEXT: cset w8, ne
+; CHECK-SD-NEXT: adds x1, x11, x9
+; CHECK-SD-NEXT: csinc w8, w8, wzr, lo
+; CHECK-SD-NEXT: cbnz w8, .LBB4_3
+; CHECK-SD-NEXT: b .LBB4_4
+; CHECK-SD-NEXT: .LBB4_2: // %overflow.no
+; CHECK-SD-NEXT: umulh x1, x0, x2
+; CHECK-SD-NEXT: mul x0, x0, x2
+; CHECK-SD-NEXT: cbz w8, .LBB4_4
+; CHECK-SD-NEXT: .LBB4_3: // %if.then
+; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: bl error
+; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT: sxtw x0, w0
+; CHECK-SD-NEXT: asr x1, x0, #63
+; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: .LBB4_4: // %cleanup
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_umul_i128:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: orr x8, x1, x3
+; CHECK-GI-NEXT: cbz x8, .LBB4_2
+; CHECK-GI-NEXT: // %bb.1: // %overflow
+; CHECK-GI-NEXT: umulh x8, x1, x2
+; CHECK-GI-NEXT: cmp x1, #0
+; CHECK-GI-NEXT: cset w12, ne
+; CHECK-GI-NEXT: cmp x3, #0
+; CHECK-GI-NEXT: mul x9, x0, x3
+; CHECK-GI-NEXT: cset w13, ne
+; CHECK-GI-NEXT: and w12, w12, w13
+; CHECK-GI-NEXT: umulh x10, x0, x3
+; CHECK-GI-NEXT: cmp x8, #0
+; CHECK-GI-NEXT: madd x9, x1, x2, x9
+; CHECK-GI-NEXT: cset w8, ne
+; CHECK-GI-NEXT: umulh x11, x0, x2
+; CHECK-GI-NEXT: cmp x10, #0
+; CHECK-GI-NEXT: mul x0, x0, x2
+; CHECK-GI-NEXT: cset w10, ne
+; CHECK-GI-NEXT: orr w8, w8, w10
+; CHECK-GI-NEXT: orr w8, w12, w8
+; CHECK-GI-NEXT: adds x1, x11, x9
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: orr w8, w8, w9
+; CHECK-GI-NEXT: tbnz w8, #0, .LBB4_3
+; CHECK-GI-NEXT: b .LBB4_4
+; CHECK-GI-NEXT: .LBB4_2: // %overflow.no
+; CHECK-GI-NEXT: mov x8, x0
+; CHECK-GI-NEXT: mul x0, x0, x2
+; CHECK-GI-NEXT: umulh x1, x8, x2
+; CHECK-GI-NEXT: mov w8, #0 // =0x0
+; CHECK-GI-NEXT: tbz w8, #0, .LBB4_4
+; CHECK-GI-NEXT: .LBB4_3: // %if.then
+; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: bl error
+; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-GI-NEXT: asr w1, w0, #31
+; CHECK-GI-NEXT: bfi x0, x1, #32, #32
+; CHECK-GI-NEXT: bfi x1, x1, #32, #32
+; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT: .LBB4_4: // %cleanup
+; CHECK-GI-NEXT: ret
entry:
%0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y)
%1 = extractvalue { i128, i1 } %0, 1