[llvm] [AArch64] Use isKnownNonZero to optimize eligible compares to cmn (PR #96349)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 29 07:24:41 PDT 2024
https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/96349
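A minimal sketch of the fold this series enables, based on the or_neg test added in patch 1 and its CHECK lines as updated in patch 2 (the value names below are illustrative; the test itself uses numbered temporaries):

  define i32 @or_neg(i32 %x, i32 %y) {
    %or = or i32 %x, 1            ; low bit is set, so %or is known non-zero
    %sub = sub nsw i32 0, %or     ; negation of a known non-zero value
    %cmp = icmp sgt i32 %sub, %y  ; signed predicate, not just eq/ne
    %ret = zext i1 %cmp to i32
    ret i32 %ret
  }

  ; before: orr w8, w0, #0x1 / neg w8, w8 / cmp w8, w1 / cset w0, gt
  ; after:  orr w8, w0, #0x1 / cmn w1, w8 / cset w0, gt
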
>From 9518e2e07f435c2c1f82bf2bdc04ac7ba1def721 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 21 Jun 2024 15:12:41 -0400
Subject: [PATCH 1/5] Pre-commit tests (NFC)
---
llvm/test/CodeGen/AArch64/cmp-chains.ll | 32 ++++++++++++++++++++
llvm/test/CodeGen/AArch64/cmp-select-sign.ll | 15 +++++++++
2 files changed, 47 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index 14cb0c82b1c03..d51c9c946f467 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -258,3 +258,35 @@ define i32 @neg_range_int(i32 %a, i32 %b, i32 %c) {
ret i32 %retval.0
}
+; (b > -3 || a < -(c | 1))
+define i32 @neg_range_int_cmn(i32 %a, i32 %b, i32 %c) {
+; SDISEL-LABEL: neg_range_int_cmn:
+; SDISEL: // %bb.0:
+; SDISEL-NEXT: orr w8, w2, #0x1
+; SDISEL-NEXT: neg w8, w8
+; SDISEL-NEXT: cmp w8, w0
+; SDISEL-NEXT: ccmn w1, #3, #0, le
+; SDISEL-NEXT: csel w0, w1, w0, gt
+; SDISEL-NEXT: ret
+;
+; GISEL-LABEL: neg_range_int_cmn:
+; GISEL: // %bb.0:
+; GISEL-NEXT: orr w8, w2, #0x1
+; GISEL-NEXT: cmn w1, #3
+; GISEL-NEXT: neg w8, w8
+; GISEL-NEXT: cset w9, gt
+; GISEL-NEXT: cmp w8, w0
+; GISEL-NEXT: cset w8, gt
+; GISEL-NEXT: orr w8, w9, w8
+; GISEL-NEXT: and w8, w8, #0x1
+; GISEL-NEXT: tst w8, #0x1
+; GISEL-NEXT: csel w0, w1, w0, ne
+; GISEL-NEXT: ret
+ %or = or i32 %c, 1
+ %sub = sub nsw i32 0, %or
+ %cmp = icmp sgt i32 %b, -3
+ %cmp1 = icmp sgt i32 %sub, %a
+ %1 = select i1 %cmp, i1 true, i1 %cmp1
+ %ret = select i1 %1, i32 %b, i32 %a
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index 09a6e26fe5a40..ca20a7a435a64 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -262,4 +262,19 @@ define <4 x i65> @sign_4xi65(<4 x i65> %a) {
ret <4 x i65> %res
}
+define i32 @or_neg(i32 %x, i32 %y) {
+; CHECK-LABEL: or_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: orr w8, w0, #0x1
+; CHECK-NEXT: neg w8, w8
+; CHECK-NEXT: cmp w8, w1
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %3 = or i32 %x, 1
+ %4 = sub nsw i32 0, %3
+ %5 = icmp sgt i32 %4, %y
+ %6 = zext i1 %5 to i32
+ ret i32 %6
+}
+
declare void @use_4xi1(<4 x i1>)
>From 51c21b83390893e1a954c36d8e6f15fb9e7f23c1 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Fri, 21 Jun 2024 15:26:02 -0400
Subject: [PATCH 2/5] [AArch64] Use isKnownNonZero to optimize to cmn instead
of cmp
---
.../Target/AArch64/AArch64ISelLowering.cpp | 44 +++++++++++++++----
llvm/test/CodeGen/AArch64/cmp-chains.ll | 3 +-
llvm/test/CodeGen/AArch64/cmp-select-sign.ll | 3 +-
3 files changed, 38 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d53f71a4def8..c0c2891113b14 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3396,9 +3396,11 @@ static bool isLegalArithImmed(uint64_t C) {
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
-static bool isCMN(SDValue Op, ISD::CondCode CC) {
+static bool isCMN(SDValue Op, SDValue CheckedVal, ISD::CondCode CC,
+ SelectionDAG &DAG) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE);
+ (CC == ISD::SETEQ || CC == ISD::SETNE ||
+ DAG.isKnownNeverZero(CheckedVal));
}
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
@@ -3443,15 +3445,27 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
- if (isCMN(RHS, CC)) {
+ if (isCMN(RHS, RHS.getOperand(1), CC, DAG)) {
// Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
- } else if (isCMN(LHS, CC)) {
+ } else if (isCMN(LHS, RHS, CC, DAG) &&
+ (!isUnsignedIntSetCC(CC) ||
+ isCMN(LHS, LHS.getOperand(1), CC, DAG))) {
// As we are looking for EQ/NE compares, the operands can be commuted ; can
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
+ // Not swapping the operands, but the folded negation swaps the condition.
+ CC = ISD::getSetCCSwappedOperands(CC);
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
+ } else if (isCMN(LHS, LHS.getOperand(1), CC, DAG) &&
+ (!isUnsignedIntSetCC(CC) || isCMN(LHS, RHS, CC, DAG))) {
+ // As we are looking for EQ/NE compares, the operands can be commuted ; can
+ // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
+ std::swap(LHS, RHS);
+ CC = ISD::getSetCCSwappedOperands(CC);
+ Opcode = AArch64ISD::ADDS;
+ RHS = RHS.getOperand(1);
} else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
if (LHS.getOpcode() == ISD::AND) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
@@ -3551,11 +3565,24 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
}
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
- if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE ||
+ DAG.isKnownNeverZero(RHS.getOperand(1)))) {
// See emitComparison() on why we can only do this for SETEQ and SETNE.
Opcode = AArch64ISD::CCMN;
RHS = RHS.getOperand(1);
}
+ } else if (LHS.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = LHS.getOperand(0);
+ if (isNullConstant(SubOp0) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE ||
+ (DAG.isKnownNeverZero(LHS.getOperand(1)) &&
+ (!isUnsignedIntSetCC(CC) || DAG.isKnownNeverZero(RHS))))) {
+ // See emitComparison() on why this needs SETEQ/SETNE or known non-zero operands.
+ std::swap(LHS, RHS);
+ CC = ISD::getSetCCSwappedOperands(CC);
+ Opcode = AArch64ISD::CCMN;
+ RHS = RHS.getOperand(1);
+ }
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
@@ -3871,9 +3898,10 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// cmp w13, w12
// can be turned into:
// cmp w12, w11, lsl #1
- if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {
- SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
-
+ if (!isa<ConstantSDNode>(RHS) || (!isLegalArithImmed(RHS->getAsZExtVal()) &&
+ !isLegalArithImmed(-RHS->getAsZExtVal()))) {
+ SDValue TheLHS =
+ isCMN(LHS, LHS.getOperand(1), CC, DAG) ? LHS.getOperand(1) : LHS;
if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
diff --git a/llvm/test/CodeGen/AArch64/cmp-chains.ll b/llvm/test/CodeGen/AArch64/cmp-chains.ll
index d51c9c946f467..4ea515911b0c5 100644
--- a/llvm/test/CodeGen/AArch64/cmp-chains.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-chains.ll
@@ -263,8 +263,7 @@ define i32 @neg_range_int_cmn(i32 %a, i32 %b, i32 %c) {
; SDISEL-LABEL: neg_range_int_cmn:
; SDISEL: // %bb.0:
; SDISEL-NEXT: orr w8, w2, #0x1
-; SDISEL-NEXT: neg w8, w8
-; SDISEL-NEXT: cmp w8, w0
+; SDISEL-NEXT: cmn w0, w8
; SDISEL-NEXT: ccmn w1, #3, #0, le
; SDISEL-NEXT: csel w0, w1, w0, gt
; SDISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
index ca20a7a435a64..036d8202a22b3 100644
--- a/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-select-sign.ll
@@ -266,8 +266,7 @@ define i32 @or_neg(i32 %x, i32 %y) {
; CHECK-LABEL: or_neg:
; CHECK: // %bb.0:
; CHECK-NEXT: orr w8, w0, #0x1
-; CHECK-NEXT: neg w8, w8
-; CHECK-NEXT: cmp w8, w1
+; CHECK-NEXT: cmn w1, w8
; CHECK-NEXT: cset w0, gt
; CHECK-NEXT: ret
%3 = or i32 %x, 1
>From 40a4553177303b0b4ac366012cc868cefb996705 Mon Sep 17 00:00:00 2001
From: AtariDreams <gfunni234 at gmail.com>
Date: Sat, 29 Jun 2024 10:01:53 -0400
Subject: [PATCH 3/5] Update AArch64ISelLowering.cpp
---
.../Target/AArch64/AArch64ISelLowering.cpp | 1969 +----------------
1 file changed, 3 insertions(+), 1966 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c0c2891113b14..d96276c8b6de6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3902,7 +3902,11 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
!isLegalArithImmed(-RHS->getAsZExtVal()))) {
SDValue TheLHS =
isCMN(LHS, LHS.getOperand(1), CC, DAG) ? LHS.getOperand(1) : LHS;
- if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
+ SDValue TheRHS =
+     !isa<ConstantSDNode>(RHS) && isCMN(RHS, RHS.getOperand(1), CC, DAG)
+         ? RHS.getOperand(1)
+         : RHS;
+ if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(TheRHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
}
@@ -26537,1968 +26538,4 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
- // MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction(Subtarget->getSecurityCheckCookieName());
- return TargetLowering::getSSPStackGuardCheck(M);
-}
-
-Value *
-AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
- // Android provides a fixed TLS slot for the SafeStack pointer. See the
- // definition of TLS_SLOT_SAFESTACK in
- // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- if (Subtarget->isTargetAndroid())
- return UseTlsOffset(IRB, 0x48);
-
- // Fuchsia is similar.
- // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
- if (Subtarget->isTargetFuchsia())
- return UseTlsOffset(IRB, -0x8);
-
- return TargetLowering::getSafeStackPointerLocation(IRB);
-}
-
-bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
- const Instruction &AndI) const {
- // Only sink 'and' mask to cmp use block if it is masking a single bit, since
- // this is likely to be fold the and/cmp/br into a single tbz instruction. It
- // may be beneficial to sink in other cases, but we would have to check that
- // the cmp would not get folded into the br to form a cbz for these to be
- // beneficial.
- ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
- if (!Mask)
- return false;
- return Mask->getValue().isPowerOf2();
-}
-
-bool AArch64TargetLowering::
- shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
- SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
- unsigned OldShiftOpcode, unsigned NewShiftOpcode,
- SelectionDAG &DAG) const {
- // Does baseline recommend not to perform the fold by default?
- if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
- X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
- return false;
- // Else, if this is a vector shift, prefer 'shl'.
- return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
-}
-
-TargetLowering::ShiftLegalizationStrategy
-AArch64TargetLowering::preferredShiftLegalizationStrategy(
- SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
- if (DAG.getMachineFunction().getFunction().hasMinSize() &&
- !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
- return ShiftLegalizationStrategy::LowerToLibcall;
- return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
- ExpansionFactor);
-}
-
-void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- // Update IsSplitCSR in AArch64unctionInfo.
- AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
- AFI->setIsSplitCSR(true);
-}
-
-void AArch64TargetLowering::insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
- const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
- if (!IStart)
- return;
-
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
- MachineBasicBlock::iterator MBBI = Entry->begin();
- for (const MCPhysReg *I = IStart; *I; ++I) {
- const TargetRegisterClass *RC = nullptr;
- if (AArch64::GPR64RegClass.contains(*I))
- RC = &AArch64::GPR64RegClass;
- else if (AArch64::FPR64RegClass.contains(*I))
- RC = &AArch64::FPR64RegClass;
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
-
- Register NewVR = MRI->createVirtualRegister(RC);
- // Create copy from CSR to a virtual register.
- // FIXME: this currently does not emit CFI pseudo-instructions, it works
- // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
- // nounwind. If we want to generalize this later, we may need to emit
- // CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
- Entry->addLiveIn(*I);
- BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
- .addReg(*I);
-
- // Insert the copy-back instructions right before the terminator.
- for (auto *Exit : Exits)
- BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::COPY), *I)
- .addReg(NewVR);
- }
-}
-
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
- // Integer division on AArch64 is expensive. However, when aggressively
- // optimizing for code size, we prefer to use a div instruction, as it is
- // usually smaller than the alternative sequence.
- // The exception to this is vector division. Since AArch64 doesn't have vector
- // integer division, leaving the division as-is is a loss even in terms of
- // size, because it will have to be scalarized, while the alternative code
- // sequence can be performed in vector form.
- bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
- return OptSize && !VT.isVector();
-}
-
-bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
- // We want inc-of-add for scalars and sub-of-not for vectors.
- return VT.isScalarInteger();
-}
-
-bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
- EVT VT) const {
- // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
- // legalize.
- if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
- return false;
- if (FPVT == MVT::v8bf16)
- return false;
- return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
-}
-
-MachineInstr *
-AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
- MachineBasicBlock::instr_iterator &MBBI,
- const TargetInstrInfo *TII) const {
- assert(MBBI->isCall() && MBBI->getCFIType() &&
- "Invalid call instruction for a KCFI check");
-
- switch (MBBI->getOpcode()) {
- case AArch64::BLR:
- case AArch64::BLRNoIP:
- case AArch64::TCRETURNri:
- case AArch64::TCRETURNrix16x17:
- case AArch64::TCRETURNrix17:
- case AArch64::TCRETURNrinotx16:
- break;
- default:
- llvm_unreachable("Unexpected CFI call opcode");
- }
-
- MachineOperand &Target = MBBI->getOperand(0);
- assert(Target.isReg() && "Invalid target operand for an indirect call");
- Target.setIsRenamable(false);
-
- return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
- .addReg(Target.getReg())
- .addImm(MBBI->getCFIType())
- .getInstr();
-}
-
-bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
- return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
-}
-
-unsigned
-AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
- return getPointerTy(DL).getSizeInBits();
-
- return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
-}
-
-void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
- MachineFrameInfo &MFI = MF.getFrameInfo();
- // If we have any vulnerable SVE stack objects then the stack protector
- // needs to be placed at the top of the SVE stack area, as the SVE locals
- // are placed above the other locals, so we allocate it as if it were a
- // scalable vector.
- // FIXME: It may be worthwhile having a specific interface for this rather
- // than doing it here in finalizeLowering.
- if (MFI.hasStackProtectorIndex()) {
- for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
- if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
- MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
- MFI.setStackID(MFI.getStackProtectorIndex(),
- TargetStackID::ScalableVector);
- MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
- break;
- }
- }
- }
- MFI.computeMaxCallFrameSize(MF);
- TargetLoweringBase::finalizeLowering(MF);
-}
-
-// Unlike X86, we let frame lowering assign offsets to all catch objects.
-bool AArch64TargetLowering::needsFixedCatchObjects() const {
- return false;
-}
-
-bool AArch64TargetLowering::shouldLocalize(
- const MachineInstr &MI, const TargetTransformInfo *TTI) const {
- auto &MF = *MI.getMF();
- auto &MRI = MF.getRegInfo();
- auto maxUses = [](unsigned RematCost) {
- // A cost of 1 means remats are basically free.
- if (RematCost == 1)
- return std::numeric_limits<unsigned>::max();
- if (RematCost == 2)
- return 2U;
-
- // Remat is too expensive, only sink if there's one user.
- if (RematCost > 2)
- return 1U;
- llvm_unreachable("Unexpected remat cost");
- };
-
- unsigned Opc = MI.getOpcode();
- switch (Opc) {
- case TargetOpcode::G_GLOBAL_VALUE: {
- // On Darwin, TLS global vars get selected into function calls, which
- // we don't want localized, as they can get moved into the middle of a
- // another call sequence.
- const GlobalValue &GV = *MI.getOperand(1).getGlobal();
- if (GV.isThreadLocal() && Subtarget->isTargetMachO())
- return false;
- return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
- }
- case TargetOpcode::G_FCONSTANT:
- case TargetOpcode::G_CONSTANT: {
- const ConstantInt *CI;
- unsigned AdditionalCost = 0;
-
- if (Opc == TargetOpcode::G_CONSTANT)
- CI = MI.getOperand(1).getCImm();
- else {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- // We try to estimate cost of 32/64b fpimms, as they'll likely be
- // materialized as integers.
- if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
- break;
- auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
- bool OptForSize =
- MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
- if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
- OptForSize))
- return true; // Constant should be cheap.
- CI =
- ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
- // FP materialization also costs an extra move, from gpr to fpr.
- AdditionalCost = 1;
- }
- APInt Imm = CI->getValue();
- InstructionCost Cost = TTI->getIntImmCost(
- Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
- assert(Cost.isValid() && "Expected a valid imm cost");
-
- unsigned RematCost = *Cost.getValue();
- RematCost += AdditionalCost;
- Register Reg = MI.getOperand(0).getReg();
- unsigned MaxUses = maxUses(RematCost);
- // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
- if (MaxUses == std::numeric_limits<unsigned>::max())
- --MaxUses;
- return MRI.hasAtMostUserInstrs(Reg, MaxUses);
- }
- // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
- // localizable.
- case AArch64::ADRP:
- case AArch64::G_ADD_LOW:
- // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
- case TargetOpcode::G_PTR_ADD:
- return true;
- default:
- break;
- }
- return TargetLoweringBase::shouldLocalize(MI, TTI);
-}
-
-bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
- // Fallback for scalable vectors.
- // Note that if EnableSVEGISel is true, we allow scalable vector types for
- // all instructions, regardless of whether they are actually supported.
- if (!EnableSVEGISel) {
- if (Inst.getType()->isScalableTy()) {
- return true;
- }
-
- for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
- if (Inst.getOperand(i)->getType()->isScalableTy())
- return true;
-
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
- if (AI->getAllocatedType()->isScalableTy())
- return true;
- }
- }
-
- // Checks to allow the use of SME instructions
- if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallerAttrs = SMEAttrs(*Inst.getFunction());
- auto CalleeAttrs = SMEAttrs(*Base);
- if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
- CallerAttrs.requiresLazySave(CalleeAttrs) ||
- CallerAttrs.requiresPreservingZT0(CalleeAttrs))
- return true;
- }
- return false;
-}
-
-// Return the largest legal scalable vector type that matches VT's element type.
-static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
- assert(VT.isFixedLengthVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal fixed length vector!");
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for SVE container");
- case MVT::i8:
- return EVT(MVT::nxv16i8);
- case MVT::i16:
- return EVT(MVT::nxv8i16);
- case MVT::i32:
- return EVT(MVT::nxv4i32);
- case MVT::i64:
- return EVT(MVT::nxv2i64);
- case MVT::bf16:
- return EVT(MVT::nxv8bf16);
- case MVT::f16:
- return EVT(MVT::nxv8f16);
- case MVT::f32:
- return EVT(MVT::nxv4f32);
- case MVT::f64:
- return EVT(MVT::nxv2f64);
- }
-}
-
-// Return a PTRUE with active lanes corresponding to the extent of VT.
-static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
- EVT VT) {
- assert(VT.isFixedLengthVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal fixed length vector!");
-
- std::optional<unsigned> PgPattern =
- getSVEPredPatternFromNumElements(VT.getVectorNumElements());
- assert(PgPattern && "Unexpected element count for SVE predicate");
-
- // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
- // AArch64SVEPredPattern::all, which can enable the use of unpredicated
- // variants of instructions when available.
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
- if (MaxSVESize && MinSVESize == MaxSVESize &&
- MaxSVESize == VT.getSizeInBits())
- PgPattern = AArch64SVEPredPattern::all;
-
- MVT MaskVT;
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for SVE predicate");
- case MVT::i8:
- MaskVT = MVT::nxv16i1;
- break;
- case MVT::i16:
- case MVT::f16:
- case MVT::bf16:
- MaskVT = MVT::nxv8i1;
- break;
- case MVT::i32:
- case MVT::f32:
- MaskVT = MVT::nxv4i1;
- break;
- case MVT::i64:
- case MVT::f64:
- MaskVT = MVT::nxv2i1;
- break;
- }
-
- return getPTrue(DAG, DL, MaskVT, *PgPattern);
-}
-
-static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
- EVT VT) {
- assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal scalable vector!");
- auto PredTy = VT.changeVectorElementType(MVT::i1);
- return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
-}
-
-static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
- if (VT.isFixedLengthVector())
- return getPredicateForFixedLengthVector(DAG, DL, VT);
-
- return getPredicateForScalableVector(DAG, DL, VT);
-}
-
-// Grow V to consume an entire SVE register.
-static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
- assert(VT.isScalableVector() &&
- "Expected to convert into a scalable vector!");
- assert(V.getValueType().isFixedLengthVector() &&
- "Expected a fixed length vector operand!");
- SDLoc DL(V);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
-}
-
-// Shrink V so it's just big enough to maintain a VT's worth of data.
-static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
- assert(VT.isFixedLengthVector() &&
- "Expected to convert into a fixed length vector!");
- assert(V.getValueType().isScalableVector() &&
- "Expected a scalable vector operand!");
- SDLoc DL(V);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
-}
-
-// Convert all fixed length vector loads larger than NEON to masked_loads.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Load = cast<LoadSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT LoadVT = ContainerVT;
- EVT MemVT = Load->getMemoryVT();
-
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- if (VT.isFloatingPoint()) {
- LoadVT = ContainerVT.changeTypeToInteger();
- MemVT = MemVT.changeTypeToInteger();
- }
-
- SDValue NewLoad = DAG.getMaskedLoad(
- LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
- DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
- Load->getAddressingMode(), Load->getExtensionType());
-
- SDValue Result = NewLoad;
- if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- Load->getMemoryVT().getVectorElementType());
-
- Result = getSVESafeBitCast(ExtendVT, Result, DAG);
- Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Result, DAG.getUNDEF(ContainerVT));
- } else if (VT.isFloatingPoint()) {
- Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
- }
-
- Result = convertFromScalableVector(DAG, VT, Result);
- SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
- return DAG.getMergeValues(MergedValues, DL);
-}
-
-static SDValue convertFixedMaskToScalableVector(SDValue Mask,
- SelectionDAG &DAG) {
- SDLoc DL(Mask);
- EVT InVT = Mask.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
-
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
-
- if (ISD::isBuildVectorAllOnes(Mask.getNode()))
- return Pg;
-
- auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
- auto Op2 = DAG.getConstant(0, DL, ContainerVT);
-
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
- {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
-}
-
-// Convert all fixed length vector loads larger than NEON to masked_loads.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Load = cast<MaskedLoadSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- SDValue Mask = Load->getMask();
- // If this is an extending load and the mask type is not the same as
- // load's type then we have to extend the mask type.
- if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
- assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
- "Incorrect mask type");
- Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
- }
- Mask = convertFixedMaskToScalableVector(Mask, DAG);
-
- SDValue PassThru;
- bool IsPassThruZeroOrUndef = false;
-
- if (Load->getPassThru()->isUndef()) {
- PassThru = DAG.getUNDEF(ContainerVT);
- IsPassThruZeroOrUndef = true;
- } else {
- if (ContainerVT.isInteger())
- PassThru = DAG.getConstant(0, DL, ContainerVT);
- else
- PassThru = DAG.getConstantFP(0, DL, ContainerVT);
- if (isZerosVector(Load->getPassThru().getNode()))
- IsPassThruZeroOrUndef = true;
- }
-
- SDValue NewLoad = DAG.getMaskedLoad(
- ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
- Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
- Load->getAddressingMode(), Load->getExtensionType());
-
- SDValue Result = NewLoad;
- if (!IsPassThruZeroOrUndef) {
- SDValue OldPassThru =
- convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
- Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
- }
-
- Result = convertFromScalableVector(DAG, VT, Result);
- SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
- return DAG.getMergeValues(MergedValues, DL);
-}
-
-// Convert all fixed length vector stores larger than NEON to masked_stores.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Store = cast<StoreSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Store->getValue().getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT MemVT = Store->getMemoryVT();
-
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
- auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
-
- if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
- EVT TruncVT = ContainerVT.changeVectorElementType(
- Store->getMemoryVT().getVectorElementType());
- MemVT = MemVT.changeTypeToInteger();
- NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
- NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
- DAG.getUNDEF(TruncVT));
- NewValue =
- getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
- } else if (VT.isFloatingPoint()) {
- MemVT = MemVT.changeTypeToInteger();
- NewValue =
- getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
- }
-
- return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
- Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
- Store->getMemOperand(), Store->getAddressingMode(),
- Store->isTruncatingStore());
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto *Store = cast<MaskedStoreSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Store->getValue().getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
- SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
-
- return DAG.getMaskedStore(
- Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
- Mask, Store->getMemoryVT(), Store->getMemOperand(),
- Store->getAddressingMode(), Store->isTruncatingStore());
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- EVT VT = Op.getValueType();
- EVT EltVT = VT.getVectorElementType();
-
- bool Signed = Op.getOpcode() == ISD::SDIV;
- unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
-
- bool Negated;
- uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
- SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
-
- SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
- SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
- if (Negated)
- Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
- DAG.getConstant(0, dl, ContainerVT), Res);
-
- return convertFromScalableVector(DAG, VT, Res);
- }
-
- // Scalable vector i32/i64 DIV is supported.
- if (EltVT == MVT::i32 || EltVT == MVT::i64)
- return LowerToPredicatedOp(Op, DAG, PredOpcode);
-
- // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
- EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
- EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
- unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-
- // If the wider type is legal: extend, op, and truncate.
- EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
- if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
- SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
- SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
- SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
- }
-
- auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
- &ExtendOpcode](SDValue Op) {
- SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
- SDValue IdxHalf =
- DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
- return std::pair<SDValue, SDValue>(
- {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
- DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
- };
-
- // If wider type is not legal: split, extend, op, trunc and concat.
- auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
- auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
- SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
- SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
- SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
- SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
- Val = convertToScalableVector(DAG, ContainerVT, Val);
-
- bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
- unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
-
- // Repeatedly unpack Val until the result is of the desired element type.
- switch (ContainerVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unimplemented container type");
- case MVT::nxv16i8:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
- if (VT.getVectorElementType() == MVT::i16)
- break;
- [[fallthrough]];
- case MVT::nxv8i16:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
- if (VT.getVectorElementType() == MVT::i32)
- break;
- [[fallthrough]];
- case MVT::nxv4i32:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
- assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
- break;
- }
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
- Val = convertToScalableVector(DAG, ContainerVT, Val);
-
- // Repeatedly truncate Val until the result is of the desired element type.
- switch (ContainerVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unimplemented container type");
- case MVT::nxv2i64:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
- if (VT.getVectorElementType() == MVT::i32)
- break;
- [[fallthrough]];
- case MVT::nxv4i32:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
- if (VT.getVectorElementType() == MVT::i16)
- break;
- [[fallthrough]];
- case MVT::nxv8i16:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
- assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
- break;
- }
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- EVT InVT = Op.getOperand(0).getValueType();
- assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- EVT InVT = Op.getOperand(0).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
-
- auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
- Op.getOperand(1), Op.getOperand(2));
-
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-// Convert vector operation 'Op' to an equivalent predicated operation whereby
-// the original operation's type is used to construct a suitable predicate.
-// NOTE: The results for inactive lanes are undefined.
-SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
- SelectionDAG &DAG,
- unsigned NewOp) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
- auto Pg = getPredicateForVector(DAG, DL, VT);
-
- if (VT.isFixedLengthVector()) {
- assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- // Create list of operands by converting existing ones to scalable types.
- SmallVector<SDValue, 4> Operands = {Pg};
- for (const SDValue &V : Op->op_values()) {
- if (isa<CondCodeSDNode>(V)) {
- Operands.push_back(V);
- continue;
- }
-
- if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
- EVT VTArg = VTNode->getVT().getVectorElementType();
- EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
- Operands.push_back(DAG.getValueType(NewVTArg));
- continue;
- }
-
- assert(isTypeLegal(V.getValueType()) &&
- "Expected only legal fixed-width types");
- Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
- }
-
- if (isMergePassthruOpcode(NewOp))
- Operands.push_back(DAG.getUNDEF(ContainerVT));
-
- auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
- return convertFromScalableVector(DAG, VT, ScalableRes);
- }
-
- assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
-
- SmallVector<SDValue, 4> Operands = {Pg};
- for (const SDValue &V : Op->op_values()) {
- assert((!V.getValueType().isVector() ||
- V.getValueType().isScalableVector()) &&
- "Only scalable vectors are supported!");
- Operands.push_back(V);
- }
-
- if (isMergePassthruOpcode(NewOp))
- Operands.push_back(DAG.getUNDEF(VT));
-
- return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
-}
-
-// If a fixed length vector operation has no side effects when applied to
-// undefined elements, we can safely use scalable vectors to perform the same
-// operation without needing to worry about predication.
-SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
- "Only expected to lower fixed length vector operation!");
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- // Create list of operands by converting existing ones to scalable types.
- SmallVector<SDValue, 4> Ops;
- for (const SDValue &V : Op->op_values()) {
- assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
-
- // Pass through non-vector operands.
- if (!V.getValueType().isVector()) {
- Ops.push_back(V);
- continue;
- }
-
- // "cast" fixed length vector to a scalable vector.
- assert(V.getValueType().isFixedLengthVector() &&
- isTypeLegal(V.getValueType()) &&
- "Only fixed length vectors are supported!");
- Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
- }
-
- auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ScalarOp);
- SDValue AccOp = ScalarOp.getOperand(0);
- SDValue VecOp = ScalarOp.getOperand(1);
- EVT SrcVT = VecOp.getValueType();
- EVT ResVT = SrcVT.getVectorElementType();
-
- EVT ContainerVT = SrcVT;
- if (SrcVT.isFixedLengthVector()) {
- ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
- VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
- }
-
- SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
-
- // Convert operands to Scalable.
- AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
- DAG.getUNDEF(ContainerVT), AccOp, Zero);
-
- // Perform reduction.
- SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
- Pg, AccOp, VecOp);
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
-}
-
-SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ReduceOp);
- SDValue Op = ReduceOp.getOperand(0);
- EVT OpVT = Op.getValueType();
- EVT VT = ReduceOp.getValueType();
-
- if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
- return SDValue();
-
- SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
-
- switch (ReduceOp.getOpcode()) {
- default:
- return SDValue();
- case ISD::VECREDUCE_OR:
- if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
- // The predicate can be 'Op' because
- // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
- return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
- else
- return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
- case ISD::VECREDUCE_AND: {
- Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
- return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
- }
- case ISD::VECREDUCE_XOR: {
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
- if (OpVT == MVT::nxv1i1) {
- // Emulate a CNTP on .Q using .D and a different governing predicate.
- Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
- }
- SDValue Cntp =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
- return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
- }
- }
-
- return SDValue();
-}
-
-SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
- SDValue ScalarOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ScalarOp);
- SDValue VecOp = ScalarOp.getOperand(0);
- EVT SrcVT = VecOp.getValueType();
-
- if (useSVEForFixedLengthVectorVT(
- SrcVT,
- /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
- VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
- }
-
- // UADDV always returns an i64 result.
- EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
- SrcVT.getVectorElementType();
- EVT RdxVT = SrcVT;
- if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
- RdxVT = getPackedSVEVectorVT(ResVT);
-
- SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
- SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
- Rdx, DAG.getConstant(0, DL, MVT::i64));
-
- // The VEC_REDUCE nodes expect an element size result.
- if (ResVT != ScalarOp.getValueType())
- Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
-
- return Res;
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- EVT InVT = Op.getOperand(1).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
- SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
-
- // Convert the mask to a predicated (NOTE: We don't need to worry about
- // inactive lanes since VSELECT is safe when given undefined elements).
- EVT MaskVT = Op.getOperand(0).getValueType();
- EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
- auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
- Mask = DAG.getNode(ISD::TRUNCATE, DL,
- MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
-
- auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
- Mask, Op1, Op2);
-
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT InVT = Op.getOperand(0).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
-
- assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
- "Only expected to lower fixed length vector operation!");
- assert(Op.getValueType() == InVT.changeTypeToInteger() &&
- "Expected integer result of the same bit length as the inputs!");
-
- auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
- auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
-
- EVT CmpVT = Pg.getValueType();
- auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
- {Pg, Op1, Op2, Op.getOperand(2)});
-
- EVT PromoteVT = ContainerVT.changeTypeToInteger();
- auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
- return convertFromScalableVector(DAG, Op.getValueType(), Promote);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- auto SrcOp = Op.getOperand(0);
- EVT VT = Op.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT =
- getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
-
- SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
- Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
- return convertFromScalableVector(DAG, VT, Op);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- unsigned NumOperands = Op->getNumOperands();
-
- assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
- "Unexpected number of operands in CONCAT_VECTORS");
-
- auto SrcOp1 = Op.getOperand(0);
- auto SrcOp2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
- EVT SrcVT = SrcOp1.getValueType();
-
- if (NumOperands > 2) {
- SmallVector<SDValue, 4> Ops;
- EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
- for (unsigned I = 0; I < NumOperands; I += 2)
- Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
- Op->getOperand(I), Op->getOperand(I + 1)));
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- }
-
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
- SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
- SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
-
- Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
-
- return convertFromScalableVector(DAG, VT, Op);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- SDValue Pg = getPredicateForVector(DAG, DL, VT);
- EVT SrcVT = Val.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- SrcVT.getVectorElementType());
-
- Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
-
- Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
- Val = getSVESafeBitCast(ExtendVT, Val, DAG);
- Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Val, DAG.getUNDEF(ContainerVT));
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
- EVT RoundVT = ContainerSrcVT.changeVectorElementType(
- VT.getVectorElementType());
- SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
-
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
- Op.getOperand(1), DAG.getUNDEF(RoundVT));
- Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
- Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
-
- Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
- return DAG.getNode(ISD::BITCAST, DL, VT, Val);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
- unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
- : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
-
- if (VT.bitsGE(SrcVT)) {
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
- VT.changeTypeToInteger(), Val);
-
- // Safe to use a larger than specified operand because by promoting the
- // value nothing has changed from an arithmetic point of view.
- Val =
- convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
- DAG.getUNDEF(ContainerDstVT));
- return convertFromScalableVector(DAG, VT, Val);
- } else {
- EVT CvtVT = ContainerSrcVT.changeVectorElementType(
- ContainerDstVT.getVectorElementType());
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
-
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
- Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
- Val = convertFromScalableVector(DAG, SrcVT, Val);
-
- Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
- return DAG.getNode(ISD::BITCAST, DL, VT, Val);
- }
-}
-
-SDValue
-AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OpVT = Op.getValueType();
- assert(OpVT.isScalableVector() &&
- "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
- SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getMergeValues({Even, Odd}, DL);
-}
-
-SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OpVT = Op.getValueType();
- assert(OpVT.isScalableVector() &&
- "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
-
- SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getMergeValues({Lo, Hi}, DL);
-}
-
-SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
- SelectionDAG &DAG) const {
- // FIXME: Maybe share some code with LowerMGather/Scatter?
- MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
- SDLoc DL(HG);
- SDValue Chain = HG->getChain();
- SDValue Inc = HG->getInc();
- SDValue Mask = HG->getMask();
- SDValue Ptr = HG->getBasePtr();
- SDValue Index = HG->getIndex();
- SDValue Scale = HG->getScale();
- SDValue IntID = HG->getIntID();
-
- // The Intrinsic ID determines the type of update operation.
- [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
- // Right now, we only support 'add' as an update.
- assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
- "Unexpected histogram update operation");
-
- EVT IncVT = Inc.getValueType();
- EVT IndexVT = Index.getValueType();
- EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
- IndexVT.getVectorElementCount());
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
- SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
- SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
-
- MachineMemOperand *MMO = HG->getMemOperand();
- // Create an MMO for the gather, without load|store flags.
- MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
- MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
- MMO->getAlign(), MMO->getAAInfo());
- ISD::MemIndexType IndexType = HG->getIndexType();
- SDValue Gather =
- DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
- GMMO, IndexType, ISD::NON_EXTLOAD);
-
- SDValue GChain = Gather.getValue(1);
-
- // Perform the histcnt, multiply by inc, add to bucket data.
- SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
- SDValue HistCnt =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
- SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
- SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
-
- // Create an MMO for the scatter, without load|store flags.
- MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
- MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
- MMO->getAlign(), MMO->getAAInfo());
-
- SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
- SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
- ScatterOps, SMMO, IndexType, false);
- return Scatter;
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
- : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
-
- if (VT.bitsGT(SrcVT)) {
- EVT CvtVT = ContainerDstVT.changeVectorElementType(
- ContainerSrcVT.getVectorElementType());
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
-
- Val = convertToScalableVector(DAG, ContainerDstVT, Val);
- Val = getSVESafeBitCast(CvtVT, Val, DAG);
- Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
- DAG.getUNDEF(ContainerDstVT));
- return convertFromScalableVector(DAG, VT, Val);
- } else {
- EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
-
- // Safe to use a larger than specified result since an fp_to_int where the
- // result doesn't fit into the destination is undefined.
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
- Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
-
- return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
- }
-}
-
-static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
- ArrayRef<int> ShuffleMask, EVT VT,
- EVT ContainerVT, SelectionDAG &DAG) {
- auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- SDLoc DL(Op);
- unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
- bool IsSingleOp =
- ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
-
- if (!Subtarget.isNeonAvailable() && !MinSVESize)
- MinSVESize = 128;
-
-  // Bail out on two-operand shuffles if SVE2 is unavailable, or if not all
-  // index values can be represented.
- if (!IsSingleOp && !Subtarget.hasSVE2())
- return SDValue();
-
- EVT VTOp1 = Op.getOperand(0).getValueType();
- unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
- unsigned IndexLen = MinSVESize / BitsPerElt;
- unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
- uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
- EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
- EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
- bool MinMaxEqual = (MinSVESize == MaxSVESize);
- assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
- "Incorrectly legalised shuffle operation");
-
- SmallVector<SDValue, 8> TBLMask;
- // If MinSVESize is not equal to MaxSVESize then we need to know which
- // TBL mask element needs adjustment.
- SmallVector<SDValue, 8> AddRuntimeVLMask;
-
-  // Bail out for 8-bit element types, because with a 2048-bit SVE register
-  // size, 8 bits are only sufficient to index into the first source vector.
- if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
- return SDValue();
-
- for (int Index : ShuffleMask) {
-    // Handle poison index values.
- if (Index < 0)
- Index = 0;
- // If the mask refers to elements in the second operand, then we have to
-    // offset the index by the number of elements in a vector. If this number
-    // is not known at compile time, we need to maintain a mask with 'VL' values
- // to add at runtime.
- if ((unsigned)Index >= ElementsPerVectorReg) {
- if (MinMaxEqual) {
- Index += IndexLen - ElementsPerVectorReg;
- } else {
- Index = Index - ElementsPerVectorReg;
- AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
- }
- } else if (!MinMaxEqual)
- AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
-    // For 8-bit elements with 1024-bit SVE registers, where MaxOffset equals
-    // 255, this might point to the last element of the second operand
-    // of the shufflevector, so we reject this transform.
- if ((unsigned)Index >= MaxOffset)
- return SDValue();
- TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
- }
-
-  // Pad the remaining lanes with an out-of-range index, which zeroes the lane,
-  // rather than with index zero, which would duplicate the first lane. Note
-  // that for i8 elements an out-of-range index can still be a valid index for
-  // a 2048-bit vector register size.
- for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
- TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
- if (!MinMaxEqual)
- AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
- }
-
- EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
- SDValue VecMask =
- DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
- SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
-
- SDValue Shuffle;
- if (IsSingleOp)
- Shuffle =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
- DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
- Op1, SVEMask);
- else if (Subtarget.hasSVE2()) {
- if (!MinMaxEqual) {
- unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
- SDValue VScale = (BitsPerElt == 64)
- ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
- : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
- SDValue VecMask =
- DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
- SDValue MulByMask = DAG.getNode(
- ISD::MUL, DL, MaskType,
- DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
- DAG.getBuildVector(MaskType, DL,
- ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
- SDValue UpdatedVecMask =
- DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
- SVEMask = convertToScalableVector(
- DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
- }
- Shuffle =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
- DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
- Op1, Op2, SVEMask);
- }
- Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
- return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
-}
-
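
The index remapping in GenerateFixedLengthSVETBL above is easiest to see in
isolation. Below is a minimal standalone sketch (not part of this patch) of
the compile-time-known register size case, i.e. MinSVESize == MaxSVESize;
remapForTBL and its parameters are hypothetical names chosen for illustration.

  #include <cstdint>
  #include <iostream>
  #include <vector>

  // Hypothetical helper, not LLVM code: remap a fixed-length shuffle mask into
  // a TBL mask for a register with IndexLen lanes.
  std::vector<uint64_t> remapForTBL(const std::vector<int> &ShuffleMask,
                                    unsigned ElementsPerVectorReg,
                                    unsigned IndexLen, uint64_t MaxOffset) {
    std::vector<uint64_t> TBLMask;
    for (int Index : ShuffleMask) {
      if (Index < 0)                                // poison index: pick lane 0
        Index = 0;
      if ((unsigned)Index >= ElementsPerVectorReg)  // second operand's elements
        Index += IndexLen - ElementsPerVectorReg;   // start at lane IndexLen
      TBLMask.push_back(Index);
    }
    // Pad the unused lanes with an out-of-range index so TBL zeroes them.
    for (size_t I = ShuffleMask.size(); I < IndexLen; ++I)
      TBLMask.push_back(MaxOffset);
    return TBLMask;
  }

  int main() {
    // v4i32 two-operand shuffle <0,5,2,7> with a 256-bit register (8 x i32).
    for (uint64_t I : remapForTBL({0, 5, 2, 7}, /*ElementsPerVectorReg=*/4,
                                  /*IndexLen=*/8, /*MaxOffset=*/0xffffffff))
      std::cout << I << ' ';  // 0 9 2 11 followed by four out-of-range entries
    std::cout << '\n';
  }
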
-SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
- auto ShuffleMask = SVN->getMask();
-
- SDLoc DL(Op);
- SDValue Op1 = Op.getOperand(0);
- SDValue Op2 = Op.getOperand(1);
-
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
- Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
-
- auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
- if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
- return MVT::i32;
- return ScalarTy;
- };
-
- if (SVN->isSplat()) {
- unsigned Lane = std::max(0, SVN->getSplatIndex());
- EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
- SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
- DAG.getConstant(Lane, DL, MVT::i64));
- Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- bool ReverseEXT = false;
- unsigned Imm;
- if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
- Imm == VT.getVectorNumElements() - 1) {
- if (ReverseEXT)
- std::swap(Op1, Op2);
- EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
- SDValue Scalar = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
- DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
- Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- unsigned EltSize = VT.getScalarSizeInBits();
- for (unsigned LaneSize : {64U, 32U, 16U}) {
- if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
- EVT NewVT =
- getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
- unsigned RevOp;
- if (EltSize == 8)
- RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
- else if (EltSize == 16)
- RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
- else
- RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
-
- Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
- Op = LowerToPredicatedOp(Op, DAG, RevOp);
- Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
- return convertFromScalableVector(DAG, VT, Op);
- }
- }
-
- if (Subtarget->hasSVE2p1() && EltSize == 64 &&
- isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
- if (!VT.isFloatingPoint())
- return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
-
- EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
- Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
- Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
- Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- unsigned WhichResult;
- if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
- WhichResult == 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
-
- if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
- }
-
- if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
-
- if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
- }
-
-  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
- // represents the same logical operation as performed by a ZIP instruction. In
- // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
- // equivalent to an AArch64 instruction. There's the extra component of
- // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
- // only operated on 64/128bit vector types that have a direct mapping to a
- // target register and so an exact mapping is implied.
- // However, when using SVE for fixed length vectors, most legal vector types
- // are actually sub-vectors of a larger SVE register. When mapping
- // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
- // how the mask's indices translate. Specifically, when the mapping requires
- // an exact meaning for a specific vector index (e.g. Index X is the last
- // vector element in the register) then such mappings are often only safe when
-  // the exact SVE register size is known. The main exception to this is when
- // indices are logically relative to the first element of either
- // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
- // when converting from fixed-length to scalable vector types (i.e. the start
- // of a fixed length vector is always the start of a scalable vector).
- unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
- if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
- if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
- Op2.isUndef()) {
- Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
- WhichResult != 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
-
- if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
- }
-
- if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
-
- if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
- }
- }
-
-  // Avoid producing a TBL instruction if we don't know the minimal SVE register
-  // size, unless NEON is unavailable and we can assume the minimal SVE register
-  // size is 128 bits.
- if (MinSVESize || !Subtarget->isNeonAvailable())
- return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
- DAG);
-
- return SDValue();
-}
-
-SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT InVT = Op.getValueType();
-
- assert(VT.isScalableVector() && isTypeLegal(VT) &&
- InVT.isScalableVector() && isTypeLegal(InVT) &&
- "Only expect to cast between legal scalable vector types!");
- assert(VT.getVectorElementType() != MVT::i1 &&
- InVT.getVectorElementType() != MVT::i1 &&
- "For predicate bitcasts, use getSVEPredicateBitCast");
-
- if (InVT == VT)
- return Op;
-
- EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
- EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
-
- // Safe bitcasting between unpacked vector types of different element counts
- // is currently unsupported because the following is missing the necessary
- // work to ensure the result's elements live where they're supposed to within
- // an SVE register.
- // 01234567
- // e.g. nxv2i32 = XX??XX??
- // nxv4f16 = X?X?X?X?
- assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
- VT == PackedVT || InVT == PackedInVT) &&
- "Unexpected bitcast!");
-
- // Pack input if required.
- if (InVT != PackedInVT)
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
-
- Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
-
- // Unpack result if required.
- if (VT != PackedVT)
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
-
- return Op;
-}
-
-bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
- SDValue N) const {
- return ::isAllActivePredicate(DAG, N);
-}
-
-EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
- return ::getPromotedVTForPredicate(VT);
-}
-
-bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
- SDValue Op, const APInt &OriginalDemandedBits,
- const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
- unsigned Depth) const {
-
- unsigned Opc = Op.getOpcode();
- switch (Opc) {
- case AArch64ISD::VSHL: {
- // Match (VSHL (VLSHR Val X) X)
- SDValue ShiftL = Op;
- SDValue ShiftR = Op->getOperand(0);
- if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
- return false;
-
- if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
- return false;
-
- unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
- unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
-
- // Other cases can be handled as well, but this is not
- // implemented.
- if (ShiftRBits != ShiftLBits)
- return false;
-
- unsigned ScalarSize = Op.getScalarValueSizeInBits();
- assert(ScalarSize > ShiftLBits && "Invalid shift imm");
-
- APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
- APInt UnusedBits = ~OriginalDemandedBits;
-
- if ((ZeroBits & UnusedBits) != ZeroBits)
- return false;
-
- // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
- // used - simplify to just Val.
- return TLO.CombineTo(Op, ShiftR->getOperand(0));
- }
- case AArch64ISD::BICi: {
- // Fold BICi if all destination bits already known to be zeroed
- SDValue Op0 = Op.getOperand(0);
- KnownBits KnownOp0 =
- TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
- // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
- uint64_t BitsToClear = Op->getConstantOperandVal(1)
- << Op->getConstantOperandVal(2);
- APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
- if (APInt(Known.getBitWidth(), BitsToClear)
- .isSubsetOf(AlreadyZeroedBitsToClear))
- return TLO.CombineTo(Op, Op0);
-
- Known = KnownOp0 &
- KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
-
- return false;
- }
- case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = IsSVECntIntrinsic(Op)) {
- unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
- if (!MaxSVEVectorSizeInBits)
- MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
- unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
- // The SVE count intrinsics don't support the multiplier immediate so we
- // don't have to account for that here. The value returned may be slightly
- // over the true required bits, as this is based on the "ALL" pattern. The
- // other patterns are also exposed by these intrinsics, but they all
- // return a value that's strictly less than "ALL".
- unsigned RequiredBits = llvm::bit_width(MaxElements);
- unsigned BitWidth = Known.Zero.getBitWidth();
- if (RequiredBits < BitWidth)
- Known.Zero.setHighBits(BitWidth - RequiredBits);
- return false;
- }
- }
- }
-
- return TargetLowering::SimplifyDemandedBitsForTargetNode(
- Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
-}
-
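
For the AArch64ISD::VSHL case above, the key observation is that
(Val >> N) << N only clears the low N bits; if none of those bits are
demanded, the shift pair is a no-op and can be replaced by Val. A tiny
standalone sketch of that argument (illustrative only, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Val = 0xDEADBEEF;
    unsigned N = 8;
    uint32_t ZeroBits = (1u << N) - 1;     // bits cleared by the shift pair
    uint32_t Demanded = 0xFFFF0000;        // the caller only looks at these bits
    uint32_t ShiftPair = (Val >> N) << N;  // what (VSHL (VLSHR Val N) N) computes
    assert(ShiftPair == (Val & ~ZeroBits));
    // If every cleared bit is also un-demanded, the shift pair is a no-op.
    if ((ZeroBits & ~Demanded) == ZeroBits)
      assert((ShiftPair & Demanded) == (Val & Demanded));
    return 0;
  }
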
-bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
- return Op.getOpcode() == AArch64ISD::DUP ||
- Op.getOpcode() == AArch64ISD::MOVI ||
- (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
- TargetLowering::isTargetCanonicalConstantNode(Op);
-}
-
-bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
- return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
- Subtarget->hasComplxNum();
-}
-
-bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation Operation, Type *Ty) const {
- auto *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy)
- return false;
-
- // If the vector is scalable, SVE is enabled, implying support for complex
- // numbers. Otherwise, we need to ensure complex number support is available
- if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
- return false;
-
- auto *ScalarTy = VTy->getScalarType();
- unsigned NumElements = VTy->getElementCount().getKnownMinValue();
-
- // We can only process vectors that have a bit size of 128 or higher (with an
- // additional 64 bits for Neon). Additionally, these vectors must have a
- // power-of-2 size, as we later split them into the smallest supported size
-  // and merge them back together after applying the complex operation.
- unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
- if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
- !llvm::isPowerOf2_32(VTyWidth))
- return false;
-
- if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
- unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
- return 8 <= ScalarWidth && ScalarWidth <= 64;
- }
-
- return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
- ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
-}
-
-Value *AArch64TargetLowering::createComplexDeinterleavingIR(
- IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
- ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
- Value *Accumulator) const {
- VectorType *Ty = cast<VectorType>(InputA->getType());
- bool IsScalable = Ty->isScalableTy();
- bool IsInt = Ty->getElementType()->isIntegerTy();
-
- unsigned TyWidth =
- Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
-
- assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
- "Vector type must be either 64 or a power of 2 that is at least 128");
-
- if (TyWidth > 128) {
- int Stride = Ty->getElementCount().getKnownMinValue() / 2;
- auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
- auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
- auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
- auto *UpperSplitA =
- B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
- auto *UpperSplitB =
- B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
- Value *LowerSplitAcc = nullptr;
- Value *UpperSplitAcc = nullptr;
- if (Accumulator) {
- LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
- UpperSplitAcc =
- B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
- }
- auto *LowerSplitInt = createComplexDeinterleavingIR(
- B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
- auto *UpperSplitInt = createComplexDeinterleavingIR(
- B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
-
- auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
- B.getInt64(0));
- return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
- }
-
- if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
- if (Accumulator == nullptr)
- Accumulator = Constant::getNullValue(Ty);
-
- if (IsScalable) {
- if (IsInt)
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cmla_x, Ty,
- {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
-
- auto *Mask = B.getAllOnesMask(Ty->getElementCount());
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_fcmla, Ty,
- {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
-
- Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
- Intrinsic::aarch64_neon_vcmla_rot90,
- Intrinsic::aarch64_neon_vcmla_rot180,
- Intrinsic::aarch64_neon_vcmla_rot270};
-
- return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
- {Accumulator, InputA, InputB});
- }
-
- if (OperationType == ComplexDeinterleavingOperation::CAdd) {
- if (IsScalable) {
- if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
- Rotation == ComplexDeinterleavingRotation::Rotation_270) {
- if (IsInt)
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cadd_x, Ty,
- {InputA, InputB, B.getInt32((int)Rotation * 90)});
-
- auto *Mask = B.getAllOnesMask(Ty->getElementCount());
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_fcadd, Ty,
- {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
- return nullptr;
- }
-
- Intrinsic::ID IntId = Intrinsic::not_intrinsic;
- if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
- IntId = Intrinsic::aarch64_neon_vcadd_rot90;
- else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
- IntId = Intrinsic::aarch64_neon_vcadd_rot270;
-
- if (IntId == Intrinsic::not_intrinsic)
- return nullptr;
-
- return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
- }
-
- return nullptr;
-}
-
-bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
- unsigned Opc = N->getOpcode();
- if (ISD::isExtOpcode(Opc)) {
- if (any_of(N->uses(),
- [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
- return false;
- }
- return true;
-}
-
-unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
- return Subtarget->getMinimumJumpTableEntries();
-}
-
-MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
- EVT VT) const {
- bool NonUnitFixedLengthVector =
- VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
- if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
-
- EVT VT1;
- MVT RegisterVT;
- unsigned NumIntermediates;
- getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
- RegisterVT);
- return RegisterVT;
-}
-
-unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
- bool NonUnitFixedLengthVector =
- VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
- if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
- return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
-
- EVT VT1;
- MVT VT2;
- unsigned NumIntermediates;
- return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
- NumIntermediates, VT2);
-}
-
-unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
- unsigned &NumIntermediates, MVT &RegisterVT) const {
- int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
- Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
- if (!RegisterVT.isFixedLengthVector() ||
- RegisterVT.getFixedSizeInBits() <= 128)
- return NumRegs;
-
- assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
- assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
- assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
-
- // A size mismatch here implies either type promotion or widening and would
-  // have resulted in scalarisation if larger vectors had not been available.
- if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
- EVT EltTy = VT.getVectorElementType();
- EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
- if (!isTypeLegal(NewVT))
- NewVT = EltTy;
-
- IntermediateVT = NewVT;
- NumIntermediates = VT.getVectorNumElements();
- RegisterVT = getRegisterType(Context, NewVT);
- return NumIntermediates;
- }
-
- // SVE VLS support does not introduce a new ABI so we should use NEON sized
- // types for vector arguments and returns.
-
- unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
- NumIntermediates *= NumSubRegs;
- NumRegs *= NumSubRegs;
-
- switch (RegisterVT.getVectorElementType().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for vector");
- case MVT::i8:
- IntermediateVT = RegisterVT = MVT::v16i8;
- break;
- case MVT::i16:
- IntermediateVT = RegisterVT = MVT::v8i16;
- break;
- case MVT::i32:
- IntermediateVT = RegisterVT = MVT::v4i32;
- break;
- case MVT::i64:
- IntermediateVT = RegisterVT = MVT::v2i64;
- break;
- case MVT::f16:
- IntermediateVT = RegisterVT = MVT::v8f16;
- break;
- case MVT::f32:
- IntermediateVT = RegisterVT = MVT::v4f32;
- break;
- case MVT::f64:
- IntermediateVT = RegisterVT = MVT::v2f64;
- break;
- case MVT::bf16:
- IntermediateVT = RegisterVT = MVT::v8bf16;
- break;
- }
-
- return NumRegs;
-}
-
-bool AArch64TargetLowering::hasInlineStackProbe(
- const MachineFunction &MF) const {
- return !Subtarget->isTargetWindows() &&
- MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
-}
-
-#ifndef NDEBUG
-void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
- switch (N->getOpcode()) {
- default:
- break;
- case AArch64ISD::SUNPKLO:
- case AArch64ISD::SUNPKHI:
- case AArch64ISD::UUNPKLO:
- case AArch64ISD::UUNPKHI: {
- assert(N->getNumValues() == 1 && "Expected one result!");
- assert(N->getNumOperands() == 1 && "Expected one operand!");
- EVT VT = N->getValueType(0);
- EVT OpVT = N->getOperand(0).getValueType();
- assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
- VT.isInteger() && "Expected integer vectors!");
- assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
- "Expected vectors of equal size!");
- // TODO: Enable assert once bogus creations have been fixed.
- // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
- // "Expected result vector with half the lanes of its input!");
- break;
- }
- case AArch64ISD::TRN1:
- case AArch64ISD::TRN2:
- case AArch64ISD::UZP1:
- case AArch64ISD::UZP2:
- case AArch64ISD::ZIP1:
- case AArch64ISD::ZIP2: {
- assert(N->getNumValues() == 1 && "Expected one result!");
- assert(N->getNumOperands() == 2 && "Expected two operands!");
- EVT VT = N->getValueType(0);
- EVT Op0VT = N->getOperand(0).getValueType();
- EVT Op1VT = N->getOperand(1).getValueType();
- assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
- "Expected vectors!");
- // TODO: Enable assert once bogus creations have been fixed.
- // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
- break;
- }
- }
-}
-#endif
+Function *AArch64TargetLowering::getSSPStackGuardCheck(const M
\ No newline at end of file
>From d2844a8786a4fa6878d872426858b93ab80211ce Mon Sep 17 00:00:00 2001
From: AtariDreams <gfunni234 at gmail.com>
Date: Sat, 29 Jun 2024 10:20:23 -0400
Subject: [PATCH 4/5] Update AArch64ISelLowering.cpp
---
.../Target/AArch64/AArch64ISelLowering.cpp | 1966 ++++++++++++++++-
1 file changed, 1965 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d96276c8b6de6..5c6453ed81f2b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26538,4 +26538,1968 @@ Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
return TargetLowering::getSDagStackGuard(M);
}
-Function *AArch64TargetLowering::getSSPStackGuardCheck(const M
\ No newline at end of file
+Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
+ // MSVC CRT has a function to validate security cookie.
+ if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
+ return M.getFunction(Subtarget->getSecurityCheckCookieName());
+ return TargetLowering::getSSPStackGuardCheck(M);
+}
+
+Value *
+AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
+ // Android provides a fixed TLS slot for the SafeStack pointer. See the
+ // definition of TLS_SLOT_SAFESTACK in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x48);
+
+ // Fuchsia is similar.
+ // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x8);
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+}
+
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+  // Only sink the 'and' mask to the cmp use block if it is masking a single
+  // bit, since that likely allows the and/cmp/br to fold into a single tbz
+  // instruction. It
+ // may be beneficial to sink in other cases, but we would have to check that
+ // the cmp would not get folded into the br to form a cbz for these to be
+ // beneficial.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask)
+ return false;
+ return Mask->getValue().isPowerOf2();
+}
+
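
A quick illustration (not from the patch) of why the single-bit restriction
matters: (x & Mask) == 0 with a power-of-two mask is a test of one bit, which
TBZ/TBNZ can branch on directly. isSingleBitMask below is a hypothetical
helper, equivalent to the isPowerOf2() check used in the function above.

  #include <cassert>
  #include <cstdint>

  // Hypothetical helper: a mask with exactly one set bit.
  bool isSingleBitMask(uint64_t Mask) {
    return Mask != 0 && (Mask & (Mask - 1)) == 0;
  }

  int main() {
    assert(isSingleBitMask(0x40));   // (x & 0x40) == 0 maps to tbz x0, #6, ...
    assert(!isSingleBitMask(0x41));  // two bits: needs an and + cbz instead
    return 0;
  }
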
+bool AArch64TargetLowering::
+ shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const {
+ // Does baseline recommend not to perform the fold by default?
+ if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
+ return false;
+ // Else, if this is a vector shift, prefer 'shl'.
+ return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
+}
+
+TargetLowering::ShiftLegalizationStrategy
+AArch64TargetLowering::preferredShiftLegalizationStrategy(
+ SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
+ if (DAG.getMachineFunction().getFunction().hasMinSize() &&
+ !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
+ return ShiftLegalizationStrategy::LowerToLibcall;
+ return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+ ExpansionFactor);
+}
+
+void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+  // Update IsSplitCSR in AArch64FunctionInfo.
+ AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
+ AFI->setIsSplitCSR(true);
+}
+
+void AArch64TargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) {
+ const TargetRegisterClass *RC = nullptr;
+ if (AArch64::GPR64RegClass.contains(*I))
+ RC = &AArch64::GPR64RegClass;
+ else if (AArch64::FPR64RegClass.contains(*I))
+ RC = &AArch64::FPR64RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ Register NewVR = MRI->createVirtualRegister(RC);
+ // Create copy from CSR to a virtual register.
+ // FIXME: this currently does not emit CFI pseudo-instructions, it works
+ // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
+ // nounwind. If we want to generalize this later, we may need to emit
+ // CFI pseudo-instructions.
+ assert(Entry->getParent()->getFunction().hasFnAttribute(
+ Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the terminator.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
+ // Integer division on AArch64 is expensive. However, when aggressively
+ // optimizing for code size, we prefer to use a div instruction, as it is
+ // usually smaller than the alternative sequence.
+ // The exception to this is vector division. Since AArch64 doesn't have vector
+ // integer division, leaving the division as-is is a loss even in terms of
+ // size, because it will have to be scalarized, while the alternative code
+ // sequence can be performed in vector form.
+ bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
+ return OptSize && !VT.isVector();
+}
+
+bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
+ // We want inc-of-add for scalars and sub-of-not for vectors.
+ return VT.isScalarInteger();
+}
+
+bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
+ EVT VT) const {
+  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
+  // to legalize.
+ if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
+ return false;
+ if (FPVT == MVT::v8bf16)
+ return false;
+ return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
+}
+
+MachineInstr *
+AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &MBBI,
+ const TargetInstrInfo *TII) const {
+ assert(MBBI->isCall() && MBBI->getCFIType() &&
+ "Invalid call instruction for a KCFI check");
+
+ switch (MBBI->getOpcode()) {
+ case AArch64::BLR:
+ case AArch64::BLRNoIP:
+ case AArch64::TCRETURNri:
+ case AArch64::TCRETURNrix16x17:
+ case AArch64::TCRETURNrix17:
+ case AArch64::TCRETURNrinotx16:
+ break;
+ default:
+ llvm_unreachable("Unexpected CFI call opcode");
+ }
+
+ MachineOperand &Target = MBBI->getOperand(0);
+ assert(Target.isReg() && "Invalid target operand for an indirect call");
+ Target.setIsRenamable(false);
+
+ return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
+ .addReg(Target.getReg())
+ .addImm(MBBI->getCFIType())
+ .getInstr();
+}
+
+bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
+ return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
+}
+
+unsigned
+AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+ return getPointerTy(DL).getSizeInBits();
+
+ return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
+}
+
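
For reference, the 3-pointer-plus-two-offset figure above corresponds to the
AAPCS64 va_list record. The struct below is a sketch of that layout (the AAPCS
fields are __stack, __gr_top, __vr_top, __gr_offs, __vr_offs; names are
shortened here to avoid reserved identifiers), not code from this patch.

  #include <cstdint>
  #include <iostream>

  struct AAPCS64VaList {
    void *Stack;
    void *GrTop;
    void *VrTop;
    int32_t GrOffs;
    int32_t VrOffs;
  };

  int main() {
    // 3 * 64 + 2 * 32 = 256 bits (32 bytes) on an LP64 target.
    std::cout << sizeof(AAPCS64VaList) * 8 << " bits\n";
    return 0;
  }
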
+void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ // If we have any vulnerable SVE stack objects then the stack protector
+ // needs to be placed at the top of the SVE stack area, as the SVE locals
+ // are placed above the other locals, so we allocate it as if it were a
+ // scalable vector.
+ // FIXME: It may be worthwhile having a specific interface for this rather
+ // than doing it here in finalizeLowering.
+ if (MFI.hasStackProtectorIndex()) {
+ for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+ if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
+ MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
+ MFI.setStackID(MFI.getStackProtectorIndex(),
+ TargetStackID::ScalableVector);
+ MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
+ break;
+ }
+ }
+ }
+ MFI.computeMaxCallFrameSize(MF);
+ TargetLoweringBase::finalizeLowering(MF);
+}
+
+// Unlike X86, we let frame lowering assign offsets to all catch objects.
+bool AArch64TargetLowering::needsFixedCatchObjects() const {
+ return false;
+}
+
+bool AArch64TargetLowering::shouldLocalize(
+ const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+ auto &MF = *MI.getMF();
+ auto &MRI = MF.getRegInfo();
+ auto maxUses = [](unsigned RematCost) {
+ // A cost of 1 means remats are basically free.
+ if (RematCost == 1)
+ return std::numeric_limits<unsigned>::max();
+ if (RematCost == 2)
+ return 2U;
+
+ // Remat is too expensive, only sink if there's one user.
+ if (RematCost > 2)
+ return 1U;
+ llvm_unreachable("Unexpected remat cost");
+ };
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ // On Darwin, TLS global vars get selected into function calls, which
+    // we don't want localized, as they can get moved into the middle of
+    // another call sequence.
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal();
+ if (GV.isThreadLocal() && Subtarget->isTargetMachO())
+ return false;
+ return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
+ }
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_CONSTANT: {
+ const ConstantInt *CI;
+ unsigned AdditionalCost = 0;
+
+ if (Opc == TargetOpcode::G_CONSTANT)
+ CI = MI.getOperand(1).getCImm();
+ else {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+      // We try to estimate the cost of 32/64-bit fp immediates, as they'll
+      // likely be materialized as integers.
+ if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
+ break;
+ auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
+ bool OptForSize =
+ MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
+ if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
+ OptForSize))
+ return true; // Constant should be cheap.
+ CI =
+ ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
+ // FP materialization also costs an extra move, from gpr to fpr.
+ AdditionalCost = 1;
+ }
+ APInt Imm = CI->getValue();
+ InstructionCost Cost = TTI->getIntImmCost(
+ Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
+ assert(Cost.isValid() && "Expected a valid imm cost");
+
+ unsigned RematCost = *Cost.getValue();
+ RematCost += AdditionalCost;
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned MaxUses = maxUses(RematCost);
+ // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
+ if (MaxUses == std::numeric_limits<unsigned>::max())
+ --MaxUses;
+ return MRI.hasAtMostUserInstrs(Reg, MaxUses);
+ }
+ // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
+ // localizable.
+ case AArch64::ADRP:
+ case AArch64::G_ADD_LOW:
+ // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
+ case TargetOpcode::G_PTR_ADD:
+ return true;
+ default:
+ break;
+ }
+ return TargetLoweringBase::shouldLocalize(MI, TTI);
+}
+
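
The maxUses lambda above maps rematerialization cost to an allowed number of
users. Restated as a plain function for readability (illustrative only; the
unreachable cost-0 case is folded into the last branch here):

  #include <cassert>
  #include <limits>

  unsigned maxUsesForRematCost(unsigned RematCost) {
    if (RematCost == 1)  // rematerialization is basically free
      return std::numeric_limits<unsigned>::max();
    if (RematCost == 2)  // cheap: allow up to two users
      return 2;
    return 1;            // expensive: only localize for a single user
  }

  int main() {
    assert(maxUsesForRematCost(1) == std::numeric_limits<unsigned>::max());
    assert(maxUsesForRematCost(3) == 1);
    return 0;
  }
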
+bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+ // Fallback for scalable vectors.
+ // Note that if EnableSVEGISel is true, we allow scalable vector types for
+ // all instructions, regardless of whether they are actually supported.
+ if (!EnableSVEGISel) {
+ if (Inst.getType()->isScalableTy()) {
+ return true;
+ }
+
+ for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+ if (Inst.getOperand(i)->getType()->isScalableTy())
+ return true;
+
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ if (AI->getAllocatedType()->isScalableTy())
+ return true;
+ }
+ }
+
+ // Checks to allow the use of SME instructions
+ if (auto *Base = dyn_cast<CallBase>(&Inst)) {
+ auto CallerAttrs = SMEAttrs(*Inst.getFunction());
+ auto CalleeAttrs = SMEAttrs(*Base);
+ if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
+ CallerAttrs.requiresLazySave(CalleeAttrs) ||
+ CallerAttrs.requiresPreservingZT0(CalleeAttrs))
+ return true;
+ }
+ return false;
+}
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE container");
+ case MVT::i8:
+ return EVT(MVT::nxv16i8);
+ case MVT::i16:
+ return EVT(MVT::nxv8i16);
+ case MVT::i32:
+ return EVT(MVT::nxv4i32);
+ case MVT::i64:
+ return EVT(MVT::nxv2i64);
+ case MVT::bf16:
+ return EVT(MVT::nxv8bf16);
+ case MVT::f16:
+ return EVT(MVT::nxv8f16);
+ case MVT::f32:
+ return EVT(MVT::nxv4f32);
+ case MVT::f64:
+ return EVT(MVT::nxv2f64);
+ }
+}
+
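
Each case above picks the packed scalable container whose element type matches
the fixed-length type, i.e. 128 / EltBits lanes per 128-bit granule. A
simplified standalone sketch of that mapping (containerFor is a hypothetical
name, and the bf16/f16 distinction is ignored for brevity):

  #include <iostream>
  #include <string>

  std::string containerFor(unsigned EltBits, bool IsFloat) {
    unsigned Lanes = 128 / EltBits;  // nxv16i8, nxv8i16, nxv4i32, nxv2i64, ...
    return "nxv" + std::to_string(Lanes) + (IsFloat ? "f" : "i") +
           std::to_string(EltBits);
  }

  int main() {
    std::cout << containerFor(32, /*IsFloat=*/false) << '\n';  // nxv4i32
    std::cout << containerFor(64, /*IsFloat=*/true) << '\n';   // nxv2f64
    return 0;
  }
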
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+
+ std::optional<unsigned> PgPattern =
+ getSVEPredPatternFromNumElements(VT.getVectorNumElements());
+ assert(PgPattern && "Unexpected element count for SVE predicate");
+
+ // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
+ // AArch64SVEPredPattern::all, which can enable the use of unpredicated
+ // variants of instructions when available.
+ const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+ if (MaxSVESize && MinSVESize == MaxSVESize &&
+ MaxSVESize == VT.getSizeInBits())
+ PgPattern = AArch64SVEPredPattern::all;
+
+ MVT MaskVT;
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE predicate");
+ case MVT::i8:
+ MaskVT = MVT::nxv16i1;
+ break;
+ case MVT::i16:
+ case MVT::f16:
+ case MVT::bf16:
+ MaskVT = MVT::nxv8i1;
+ break;
+ case MVT::i32:
+ case MVT::f32:
+ MaskVT = MVT::nxv4i1;
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ MaskVT = MVT::nxv2i1;
+ break;
+ }
+
+ return getPTrue(DAG, DL, MaskVT, *PgPattern);
+}
+
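
The PTRUE above activates exactly as many lanes as the fixed-length type has
elements, using an exact VLn pattern. The sketch below approximates what
getSVEPredPatternFromNumElements provides (predPatternFor is a hypothetical
name; exact patterns exist for counts 1-8 and the powers of two up to 256):

  #include <iostream>
  #include <optional>
  #include <string>

  std::optional<std::string> predPatternFor(unsigned NumElts) {
    switch (NumElts) {
    case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8:
    case 16: case 32: case 64: case 128: case 256:
      return "vl" + std::to_string(NumElts);
    default:
      return std::nullopt;  // no exact pattern for this element count
    }
  }

  int main() {
    std::cout << *predPatternFor(8) << '\n';              // vl8, e.g. for v8i16
    std::cout << predPatternFor(12).has_value() << '\n';  // 0: no exact pattern
    return 0;
  }
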
+static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal scalable vector!");
+ auto PredTy = VT.changeVectorElementType(MVT::i1);
+ return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
+}
+
+static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
+ if (VT.isFixedLengthVector())
+ return getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ return getPredicateForScalableVector(DAG, DL, VT);
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ "Expected to convert into a scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected a fixed length vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isFixedLengthVector() &&
+ "Expected to convert into a fixed length vector!");
+ assert(V.getValueType().isScalableVector() &&
+ "Expected a scalable vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<LoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT LoadVT = ContainerVT;
+ EVT MemVT = Load->getMemoryVT();
+
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ if (VT.isFloatingPoint()) {
+ LoadVT = ContainerVT.changeTypeToInteger();
+ MemVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue NewLoad = DAG.getMaskedLoad(
+ LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
+ DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
+ Load->getAddressingMode(), Load->getExtensionType());
+
+ SDValue Result = NewLoad;
+ if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
+ EVT ExtendVT = ContainerVT.changeVectorElementType(
+ Load->getMemoryVT().getVectorElementType());
+
+ Result = getSVESafeBitCast(ExtendVT, Result, DAG);
+ Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, Result, DAG.getUNDEF(ContainerVT));
+ } else if (VT.isFloatingPoint()) {
+ Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
+ }
+
+ Result = convertFromScalableVector(DAG, VT, Result);
+ SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
+static SDValue convertFixedMaskToScalableVector(SDValue Mask,
+ SelectionDAG &DAG) {
+ SDLoc DL(Mask);
+ EVT InVT = Mask.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
+
+ if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+ return Pg;
+
+ auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
+ auto Op2 = DAG.getConstant(0, DL, ContainerVT);
+
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
+ {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
+}
+
+// Convert all fixed length vector masked loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<MaskedLoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ SDValue Mask = Load->getMask();
+ // If this is an extending load and the mask type is not the same as
+  // the load's type then we have to extend the mask type.
+ if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
+ assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
+ "Incorrect mask type");
+ Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
+ }
+ Mask = convertFixedMaskToScalableVector(Mask, DAG);
+
+ SDValue PassThru;
+ bool IsPassThruZeroOrUndef = false;
+
+ if (Load->getPassThru()->isUndef()) {
+ PassThru = DAG.getUNDEF(ContainerVT);
+ IsPassThruZeroOrUndef = true;
+ } else {
+ if (ContainerVT.isInteger())
+ PassThru = DAG.getConstant(0, DL, ContainerVT);
+ else
+ PassThru = DAG.getConstantFP(0, DL, ContainerVT);
+ if (isZerosVector(Load->getPassThru().getNode()))
+ IsPassThruZeroOrUndef = true;
+ }
+
+ SDValue NewLoad = DAG.getMaskedLoad(
+ ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+ Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
+ Load->getAddressingMode(), Load->getExtensionType());
+
+ SDValue Result = NewLoad;
+ if (!IsPassThruZeroOrUndef) {
+ SDValue OldPassThru =
+ convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
+ Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
+ }
+
+ Result = convertFromScalableVector(DAG, VT, Result);
+ SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Store = cast<StoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT MemVT = Store->getMemoryVT();
+
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+
+ if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
+ EVT TruncVT = ContainerVT.changeVectorElementType(
+ Store->getMemoryVT().getVectorElementType());
+ MemVT = MemVT.changeTypeToInteger();
+ NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
+ NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
+ DAG.getUNDEF(TruncVT));
+ NewValue =
+ getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+ } else if (VT.isFloatingPoint()) {
+ MemVT = MemVT.changeTypeToInteger();
+ NewValue =
+ getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
+ }
+
+ return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
+ Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto *Store = cast<MaskedStoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
+
+ return DAG.getMaskedStore(
+ Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+ Mask, Store->getMemoryVT(), Store->getMemOperand(),
+ Store->getAddressingMode(), Store->isTruncatingStore());
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ bool Signed = Op.getOpcode() == ISD::SDIV;
+ unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
+
+ bool Negated;
+ uint64_t SplatVal;
+ if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+ SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
+
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
+ SDValue Res =
+ DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
+ if (Negated)
+ Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
+ DAG.getConstant(0, dl, ContainerVT), Res);
+
+ return convertFromScalableVector(DAG, VT, Res);
+ }
+
+ // Scalable vector i32/i64 DIV is supported.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
+ return LowerToPredicatedOp(Op, DAG, PredOpcode);
+
+ // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
+ unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ // If the wider type is legal: extend, op, and truncate.
+ EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+ if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
+ SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
+ SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
+ SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
+ }
+
+ auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
+ &ExtendOpcode](SDValue Op) {
+ SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
+ SDValue IdxHalf =
+ DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
+ return std::pair<SDValue, SDValue>(
+ {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
+ DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
+ };
+
+ // If wider type is not legal: split, extend, op, trunc and concat.
+ auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
+ auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
+ SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
+ SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
+ SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
+ SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
+}
+
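
When neither the native i32/i64 path nor the legal-wider-type path applies,
the element type is promoted before dividing. The scalar analogue below
(illustrative only, not from the patch) shows why the promote-divide-truncate
round trip is exact: the quotient of two sign-extended i8 values always fits
back into i8, the INT8_MIN / -1 overflow case aside (which is undefined for
the IR sdiv anyway).

  #include <cstdint>
  #include <iostream>

  int8_t sdiv8ViaI32(int8_t A, int8_t B) {
    int32_t Wide = int32_t(A) / int32_t(B);  // sign-extend, divide in i32
    return int8_t(Wide);                     // truncate back to i8
  }

  int main() {
    std::cout << int(sdiv8ViaI32(-100, 3)) << '\n';  // -33
    return 0;
  }
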
+SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
+ unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+
+ // Repeatedly unpack Val until the result is of the desired element type.
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv16i8:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ [[fallthrough]];
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ [[fallthrough]];
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
+ assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ // Repeatedly truncate Val until the result is of the desired element type.
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv2i64:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ [[fallthrough]];
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ [[fallthrough]];
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
+ assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
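
The bitcast + UZP1 step above can be modelled on scalars: reinterpret each
wide lane as two narrow lanes (little-endian) and keep the even, low-order
halves. This is a simplified single-operand sketch, not the actual
two-operand UZP1 semantics, and it is not part of the patch.

  #include <cstdint>
  #include <cstring>
  #include <iostream>
  #include <vector>

  std::vector<uint16_t> truncViaUzp1(const std::vector<uint32_t> &V) {
    std::vector<uint16_t> AsU16(V.size() * 2);
    std::memcpy(AsU16.data(), V.data(), V.size() * sizeof(uint32_t));  // bitcast
    std::vector<uint16_t> Out;
    for (size_t I = 0; I < AsU16.size(); I += 2)  // UZP1: keep even lanes
      Out.push_back(AsU16[I]);
    return Out;
  }

  int main() {
    for (uint16_t X : truncViaUzp1({0x00010002, 0xABCD1234}))
      std::cout << std::hex << X << ' ';  // 2 1234
    std::cout << '\n';
    return 0;
  }
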
+SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT InVT = Op.getOperand(0).getValueType();
+ assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
+
+ auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
+ Op.getOperand(1), Op.getOperand(2));
+
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
+// Convert vector operation 'Op' to an equivalent predicated operation whereby
+// the original operation's type is used to construct a suitable predicate.
+// NOTE: The results for inactive lanes are undefined.
+SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOp) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ auto Pg = getPredicateForVector(DAG, DL, VT);
+
+ if (VT.isFixedLengthVector()) {
+ assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ if (isa<CondCodeSDNode>(V)) {
+ Operands.push_back(V);
+ continue;
+ }
+
+ if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
+ EVT VTArg = VTNode->getVT().getVectorElementType();
+ EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
+ Operands.push_back(DAG.getValueType(NewVTArg));
+ continue;
+ }
+
+ assert(isTypeLegal(V.getValueType()) &&
+ "Expected only legal fixed-width types");
+ Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ if (isMergePassthruOpcode(NewOp))
+ Operands.push_back(DAG.getUNDEF(ContainerVT));
+
+ auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+ }
+
+ assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ assert((!V.getValueType().isVector() ||
+ V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
+ Operands.push_back(V);
+ }
+
+ if (isMergePassthruOpcode(NewOp))
+ Operands.push_back(DAG.getUNDEF(VT));
+
+ return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
+}
+
+// If a fixed length vector operation has no side effects when applied to
+// undefined elements, we can safely use scalable vectors to perform the same
+// operation without needing to worry about predication.
+SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
+ "Only expected to lower fixed length vector operation!");
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Ops;
+ for (const SDValue &V : Op->op_values()) {
+ assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
+
+ // Pass through non-vector operands.
+ if (!V.getValueType().isVector()) {
+ Ops.push_back(V);
+ continue;
+ }
+
+ // "cast" fixed length vector to a scalable vector.
+ assert(V.getValueType().isFixedLengthVector() &&
+ isTypeLegal(V.getValueType()) &&
+ "Only fixed length vectors are supported!");
+ Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ScalarOp);
+ SDValue AccOp = ScalarOp.getOperand(0);
+ SDValue VecOp = ScalarOp.getOperand(1);
+ EVT SrcVT = VecOp.getValueType();
+ EVT ResVT = SrcVT.getVectorElementType();
+
+ EVT ContainerVT = SrcVT;
+ if (SrcVT.isFixedLengthVector()) {
+ ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+
+ SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+ // Convert operands to Scalable.
+ AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), AccOp, Zero);
+
+ // Perform reduction.
+ SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
+ Pg, AccOp, VecOp);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
+}
+
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ReduceOp);
+ SDValue Op = ReduceOp.getOperand(0);
+ EVT OpVT = Op.getValueType();
+ EVT VT = ReduceOp.getValueType();
+
+ if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
+
+ switch (ReduceOp.getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::VECREDUCE_OR:
+ if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
+ // The predicate can be 'Op' because
+ // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
+ return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
+ else
+ return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
+ case ISD::VECREDUCE_AND: {
+ Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
+ return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
+ }
+ case ISD::VECREDUCE_XOR: {
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+ if (OpVT == MVT::nxv1i1) {
+ // Emulate a CNTP on .Q using .D and a different governing predicate.
+ Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
+ }
+ SDValue Cntp =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
+ return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
+ }
+ }
+
+ return SDValue();
+}
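
As an aside on the VECREDUCE_XOR case just above: XOR-reducing an i1 predicate is the parity of its active lane count, which is why taking the low bit of the CNTP result (via the any-extend/truncate to the result type) is enough. A minimal standalone C++ sketch of that identity, not part of the patch:

#include <bitset>
#include <cassert>
#include <cstdint>

// XOR-reducing a predicate equals the parity of its set lanes, i.e. the low
// bit of the lane count, which is what truncating the CNTP result produces.
static bool xorReduce(uint64_t PredBits, unsigned NumLanes) {
  bool Acc = false;
  for (unsigned I = 0; I < NumLanes; ++I)
    Acc ^= ((PredBits >> I) & 1) != 0;
  return Acc;
}

int main() {
  uint64_t Pred = 0b10110;                          // hypothetical 5-lane predicate
  unsigned Count = std::bitset<64>(Pred).count();   // CNTP equivalent: 3 active lanes
  assert(xorReduce(Pred, 5) == ((Count & 1) != 0)); // parity == low bit of the count
  return 0;
}
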
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+ SDValue ScalarOp,
+ SelectionDAG &DAG) const {
+ SDLoc DL(ScalarOp);
+ SDValue VecOp = ScalarOp.getOperand(0);
+ EVT SrcVT = VecOp.getValueType();
+
+ if (useSVEForFixedLengthVectorVT(
+ SrcVT,
+ /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+ }
+
+ // UADDV always returns an i64 result.
+ EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
+ SrcVT.getVectorElementType();
+ EVT RdxVT = SrcVT;
+ if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
+ RdxVT = getPackedSVEVectorVT(ResVT);
+
+ SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
+ SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
+ Rdx, DAG.getConstant(0, DL, MVT::i64));
+
+ // The VEC_REDUCE nodes expect an element size result.
+ if (ResVT != ScalarOp.getValueType())
+ Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
+
+ return Res;
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ EVT InVT = Op.getOperand(1).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+ SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
+ SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
+
+  // Convert the mask to a predicate (NOTE: we don't need to worry about
+  // inactive lanes since VSELECT is safe when given undefined elements).
+ EVT MaskVT = Op.getOperand(0).getValueType();
+ EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
+ auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
+ Mask = DAG.getNode(ISD::TRUNCATE, DL,
+ MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
+
+ auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
+ Mask, Op1, Op2);
+
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT InVT = Op.getOperand(0).getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
+
+ assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
+ "Only expected to lower fixed length vector operation!");
+ assert(Op.getValueType() == InVT.changeTypeToInteger() &&
+ "Expected integer result of the same bit length as the inputs!");
+
+ auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
+ auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
+ auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
+
+ EVT CmpVT = Pg.getValueType();
+ auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
+ {Pg, Op1, Op2, Op.getOperand(2)});
+
+ EVT PromoteVT = ContainerVT.changeTypeToInteger();
+ auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
+ return convertFromScalableVector(DAG, Op.getValueType(), Promote);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ auto SrcOp = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT =
+ getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
+
+ SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
+ Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
+ return convertFromScalableVector(DAG, VT, Op);
+}
+
+SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ unsigned NumOperands = Op->getNumOperands();
+
+ assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
+ "Unexpected number of operands in CONCAT_VECTORS");
+
+ auto SrcOp1 = Op.getOperand(0);
+ auto SrcOp2 = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ EVT SrcVT = SrcOp1.getValueType();
+
+ if (NumOperands > 2) {
+ SmallVector<SDValue, 4> Ops;
+ EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
+ for (unsigned I = 0; I < NumOperands; I += 2)
+ Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
+ Op->getOperand(I), Op->getOperand(I + 1)));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
+ }
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
+ SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
+ SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
+
+ Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
+
+ return convertFromScalableVector(DAG, VT, Op);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ExtendVT = ContainerVT.changeVectorElementType(
+ SrcVT.getVectorElementType());
+
+ Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
+
+ Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
+ Val = getSVESafeBitCast(ExtendVT, Val, DAG);
+ Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
+ Pg, Val, DAG.getUNDEF(ContainerVT));
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+ EVT RoundVT = ContainerSrcVT.changeVectorElementType(
+ VT.getVectorElementType());
+ SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
+ Op.getOperand(1), DAG.getUNDEF(RoundVT));
+ Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
+ Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
+
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Val);
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
+ unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+ : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+
+ if (VT.bitsGE(SrcVT)) {
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
+ VT.changeTypeToInteger(), Val);
+
+ // Safe to use a larger than specified operand because by promoting the
+ // value nothing has changed from an arithmetic point of view.
+ Val =
+ convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
+ Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
+ DAG.getUNDEF(ContainerDstVT));
+ return convertFromScalableVector(DAG, VT, Val);
+ } else {
+ EVT CvtVT = ContainerSrcVT.changeVectorElementType(
+ ContainerDstVT.getVectorElementType());
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
+
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
+ Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
+ Val = convertFromScalableVector(DAG, SrcVT, Val);
+
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Val);
+ }
+}
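
Regarding the "safe to use a larger than specified operand" comment in the int-to-fp path above: widening the integer input first leaves its value unchanged, so the converted result is the same. A small standalone C++ sketch, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  int8_t Narrow = -5;
  // Sign-extending before the conversion does not change the arithmetic
  // value, so converting the widened integer yields the same result.
  double FromNarrow = static_cast<double>(Narrow);
  double FromWide = static_cast<double>(static_cast<int32_t>(Narrow));
  assert(FromNarrow == FromWide);
  return 0;
}
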
+
+SDValue
+AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OpVT = Op.getValueType();
+ assert(OpVT.isScalableVector() &&
+ "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+ SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
+ Op.getOperand(1));
+ SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getMergeValues({Even, Odd}, DL);
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT OpVT = Op.getValueType();
+ assert(OpVT.isScalableVector() &&
+ "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+
+ SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
+ Op.getOperand(1));
+ SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getMergeValues({Lo, Hi}, DL);
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
+ SelectionDAG &DAG) const {
+ // FIXME: Maybe share some code with LowerMGather/Scatter?
+ MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
+ SDLoc DL(HG);
+ SDValue Chain = HG->getChain();
+ SDValue Inc = HG->getInc();
+ SDValue Mask = HG->getMask();
+ SDValue Ptr = HG->getBasePtr();
+ SDValue Index = HG->getIndex();
+ SDValue Scale = HG->getScale();
+ SDValue IntID = HG->getIntID();
+
+ // The Intrinsic ID determines the type of update operation.
+ [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
+ // Right now, we only support 'add' as an update.
+ assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
+ "Unexpected histogram update operation");
+
+ EVT IncVT = Inc.getValueType();
+ EVT IndexVT = Index.getValueType();
+ EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
+ IndexVT.getVectorElementCount());
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
+ SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
+ SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
+
+ MachineMemOperand *MMO = HG->getMemOperand();
+ // Create an MMO for the gather, without load|store flags.
+ MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
+ MMO->getAlign(), MMO->getAAInfo());
+ ISD::MemIndexType IndexType = HG->getIndexType();
+ SDValue Gather =
+ DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
+ GMMO, IndexType, ISD::NON_EXTLOAD);
+
+ SDValue GChain = Gather.getValue(1);
+
+ // Perform the histcnt, multiply by inc, add to bucket data.
+ SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
+ SDValue HistCnt =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
+
+ // Create an MMO for the scatter, without load|store flags.
+ MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
+ MMO->getAlign(), MMO->getAAInfo());
+
+ SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
+ SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
+ ScatterOps, SMMO, IndexType, false);
+ return Scatter;
+}
+
+SDValue
+AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
+ unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
+ : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT SrcVT = Val.getValueType();
+ EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
+ EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
+
+ if (VT.bitsGT(SrcVT)) {
+ EVT CvtVT = ContainerDstVT.changeVectorElementType(
+ ContainerSrcVT.getVectorElementType());
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
+
+ Val = convertToScalableVector(DAG, ContainerDstVT, Val);
+ Val = getSVESafeBitCast(CvtVT, Val, DAG);
+ Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
+ DAG.getUNDEF(ContainerDstVT));
+ return convertFromScalableVector(DAG, VT, Val);
+ } else {
+ EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
+ SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
+
+ // Safe to use a larger than specified result since an fp_to_int where the
+ // result doesn't fit into the destination is undefined.
+ Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
+ Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
+ Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
+
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
+ }
+}
+
+static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
+ ArrayRef<int> ShuffleMask, EVT VT,
+ EVT ContainerVT, SelectionDAG &DAG) {
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ SDLoc DL(Op);
+ unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+ bool IsSingleOp =
+ ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
+
+ if (!Subtarget.isNeonAvailable() && !MinSVESize)
+ MinSVESize = 128;
+
+  // Ignore two-operand shuffles if there is no SVE2 or if not all index
+  // values can be represented.
+ if (!IsSingleOp && !Subtarget.hasSVE2())
+ return SDValue();
+
+ EVT VTOp1 = Op.getOperand(0).getValueType();
+ unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
+ unsigned IndexLen = MinSVESize / BitsPerElt;
+ unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
+ uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
+ EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
+ EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
+ bool MinMaxEqual = (MinSVESize == MaxSVESize);
+ assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
+ "Incorrectly legalised shuffle operation");
+
+ SmallVector<SDValue, 8> TBLMask;
+ // If MinSVESize is not equal to MaxSVESize then we need to know which
+ // TBL mask element needs adjustment.
+ SmallVector<SDValue, 8> AddRuntimeVLMask;
+
+  // Bail out for 8-bit element types, because with a 2048-bit SVE register
+  // size 8 bits are only sufficient to index into the first source vector.
+ if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
+ return SDValue();
+
+ for (int Index : ShuffleMask) {
+    // Handle poison index values.
+ if (Index < 0)
+ Index = 0;
+ // If the mask refers to elements in the second operand, then we have to
+    // offset the index by the number of elements in a vector. If this number
+    // is not known at compile time, we need to maintain a mask with 'VL' values
+    // to add at runtime.
+ if ((unsigned)Index >= ElementsPerVectorReg) {
+ if (MinMaxEqual) {
+ Index += IndexLen - ElementsPerVectorReg;
+ } else {
+ Index = Index - ElementsPerVectorReg;
+ AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
+ }
+ } else if (!MinMaxEqual)
+ AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
+    // For 8-bit elements and 1024-bit SVE registers, with MaxOffset equal to
+    // 255, this might point to the last element in the second operand of the
+    // shufflevector, thus we reject this transform.
+ if ((unsigned)Index >= MaxOffset)
+ return SDValue();
+ TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
+ }
+
+  // Choosing an out-of-range index leads to those lanes being zeroed, whereas
+  // using index zero would instead duplicate the first lane into them. Note
+  // that for i8 elements an out-of-range index can still be a valid index
+  // with a 2048-bit vector register size.
+ for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
+ TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
+ if (!MinMaxEqual)
+ AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
+ }
+
+ EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
+ SDValue VecMask =
+ DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
+ SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
+
+ SDValue Shuffle;
+ if (IsSingleOp)
+ Shuffle =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+ DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
+ Op1, SVEMask);
+ else if (Subtarget.hasSVE2()) {
+ if (!MinMaxEqual) {
+ unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
+ SDValue VScale = (BitsPerElt == 64)
+ ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
+ : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
+ SDValue VecMask =
+ DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
+ SDValue MulByMask = DAG.getNode(
+ ISD::MUL, DL, MaskType,
+ DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
+ DAG.getBuildVector(MaskType, DL,
+ ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
+ SDValue UpdatedVecMask =
+ DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
+ SVEMask = convertToScalableVector(
+ DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
+ }
+ Shuffle =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+ DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
+ Op1, Op2, SVEMask);
+ }
+ Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
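
To illustrate the TBL index rebasing above for the MinSVESize == MaxSVESize case, here is a standalone C++ sketch using hypothetical sizes (v4i32 data on a 256-bit register); it is not part of the patch:

#include <cassert>

int main() {
  // Hypothetical configuration: a v4i32 fixed-length shuffle lowered on a
  // 256-bit SVE implementation, so each TBL input holds IndexLen = 8 lanes
  // but only the low ElementsPerVectorReg = 4 lanes carry fixed-length data.
  unsigned ElementsPerVectorReg = 4;
  unsigned IndexLen = 8;

  // Shuffle index 5 means element 1 of the second operand. In the
  // concatenated TBL inputs that element lives at lane IndexLen + 1 = 9,
  // hence the Index += IndexLen - ElementsPerVectorReg adjustment.
  unsigned ShuffleIndex = 5;
  unsigned TBLIndex = ShuffleIndex + (IndexLen - ElementsPerVectorReg);
  assert(TBLIndex == 9);
  return 0;
}
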
+
+SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+ auto ShuffleMask = SVN->getMask();
+
+ SDLoc DL(Op);
+ SDValue Op1 = Op.getOperand(0);
+ SDValue Op2 = Op.getOperand(1);
+
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+ Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
+ Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
+
+ auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
+ if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
+ return MVT::i32;
+ return ScalarTy;
+ };
+
+ if (SVN->isSplat()) {
+ unsigned Lane = std::max(0, SVN->getSplatIndex());
+ EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
+ SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
+ DAG.getConstant(Lane, DL, MVT::i64));
+ Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
+ bool ReverseEXT = false;
+ unsigned Imm;
+ if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
+ Imm == VT.getVectorNumElements() - 1) {
+ if (ReverseEXT)
+ std::swap(Op1, Op2);
+ EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
+ SDValue Scalar = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
+ DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
+ Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
+ unsigned EltSize = VT.getScalarSizeInBits();
+ for (unsigned LaneSize : {64U, 32U, 16U}) {
+ if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
+ EVT NewVT =
+ getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
+ unsigned RevOp;
+ if (EltSize == 8)
+ RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
+ else if (EltSize == 16)
+ RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
+ else
+ RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
+
+ Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
+ Op = LowerToPredicatedOp(Op, DAG, RevOp);
+ Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+ }
+
+ if (Subtarget->hasSVE2p1() && EltSize == 64 &&
+ isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
+ if (!VT.isFloatingPoint())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
+
+ EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
+ Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
+ Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
+ Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
+ unsigned WhichResult;
+ if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+ WhichResult == 0)
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
+
+ if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+ }
+
+ if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
+
+ if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
+ }
+
+  // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
+ // represents the same logical operation as performed by a ZIP instruction. In
+ // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
+ // equivalent to an AArch64 instruction. There's the extra component of
+ // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
+  // only operated on 64/128-bit vector types that have a direct mapping to a
+ // target register and so an exact mapping is implied.
+ // However, when using SVE for fixed length vectors, most legal vector types
+ // are actually sub-vectors of a larger SVE register. When mapping
+ // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
+ // how the mask's indices translate. Specifically, when the mapping requires
+ // an exact meaning for a specific vector index (e.g. Index X is the last
+ // vector element in the register) then such mappings are often only safe when
+  // the exact SVE register size is known. The main exception to this is when
+ // indices are logically relative to the first element of either
+ // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
+ // when converting from fixed-length to scalable vector types (i.e. the start
+ // of a fixed length vector is always the start of a scalable vector).
+ unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
+ if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
+ if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
+ Op2.isUndef()) {
+ Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
+ return convertFromScalableVector(DAG, VT, Op);
+ }
+
+ if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+ WhichResult != 0)
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
+
+ if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+ }
+
+ if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
+
+ if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+ unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+ return convertFromScalableVector(
+ DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
+ }
+ }
+
+  // Avoid producing a TBL instruction if we don't know the minimum SVE
+  // register size, unless NEON is not available and we can assume the minimum
+  // SVE register size is 128 bits.
+ if (MinSVESize || !Subtarget->isNeonAvailable())
+ return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
+ DAG);
+
+ return SDValue();
+}
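
The comment above about ZIP-style masks only being safe with a known register size can be modelled directly: ZIP2 reads the upper half of each register, which lines up with the fixed-length elements only when the register is exactly the fixed vector's size. A standalone C++ sketch, not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

// Model of ZIP2: interleave the upper halves of two registers.
static std::vector<int> zip2(const std::vector<int> &A,
                             const std::vector<int> &B) {
  std::vector<int> R;
  for (std::size_t I = A.size() / 2; I < A.size(); ++I) {
    R.push_back(A[I]);
    R.push_back(B[I]);
  }
  return R;
}

int main() {
  // Two 4-lane registers, i.e. the exact 128-bit case.
  std::vector<int> A = {0, 1, 2, 3}, B = {4, 5, 6, 7};
  // The fixed-length v4i32 ZIP2 shuffle mask {2,6,3,7} matches scalable ZIP2
  // only because the register holds exactly 4 lanes.
  assert((zip2(A, B) == std::vector<int>{2, 6, 3, 7}));
  // With 8-lane (256-bit) registers, ZIP2 would read lanes 4-7 instead,
  // which no longer hold the original fixed-length elements.
  return 0;
}
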
+
+SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT InVT = Op.getValueType();
+
+ assert(VT.isScalableVector() && isTypeLegal(VT) &&
+ InVT.isScalableVector() && isTypeLegal(InVT) &&
+ "Only expect to cast between legal scalable vector types!");
+ assert(VT.getVectorElementType() != MVT::i1 &&
+ InVT.getVectorElementType() != MVT::i1 &&
+ "For predicate bitcasts, use getSVEPredicateBitCast");
+
+ if (InVT == VT)
+ return Op;
+
+ EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
+ EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
+
+ // Safe bitcasting between unpacked vector types of different element counts
+ // is currently unsupported because the following is missing the necessary
+ // work to ensure the result's elements live where they're supposed to within
+ // an SVE register.
+ // 01234567
+ // e.g. nxv2i32 = XX??XX??
+ // nxv4f16 = X?X?X?X?
+ assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
+ VT == PackedVT || InVT == PackedInVT) &&
+ "Unexpected bitcast!");
+
+ // Pack input if required.
+ if (InVT != PackedInVT)
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
+
+ Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
+
+ // Unpack result if required.
+ if (VT != PackedVT)
+ Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
+
+ return Op;
+}
+
+bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
+ SDValue N) const {
+ return ::isAllActivePredicate(DAG, N);
+}
+
+EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
+ return ::getPromotedVTForPredicate(VT);
+}
+
+bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case AArch64ISD::VSHL: {
+ // Match (VSHL (VLSHR Val X) X)
+ SDValue ShiftL = Op;
+ SDValue ShiftR = Op->getOperand(0);
+ if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
+ return false;
+
+ if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
+ return false;
+
+ unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
+ unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
+
+ // Other cases can be handled as well, but this is not
+ // implemented.
+ if (ShiftRBits != ShiftLBits)
+ return false;
+
+ unsigned ScalarSize = Op.getScalarValueSizeInBits();
+ assert(ScalarSize > ShiftLBits && "Invalid shift imm");
+
+ APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
+ APInt UnusedBits = ~OriginalDemandedBits;
+
+ if ((ZeroBits & UnusedBits) != ZeroBits)
+ return false;
+
+ // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
+ // used - simplify to just Val.
+ return TLO.CombineTo(Op, ShiftR->getOperand(0));
+ }
+ case AArch64ISD::BICi: {
+    // Fold BICi if all destination bits are already known to be zeroed
+ SDValue Op0 = Op.getOperand(0);
+ KnownBits KnownOp0 =
+ TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
+ // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
+ uint64_t BitsToClear = Op->getConstantOperandVal(1)
+ << Op->getConstantOperandVal(2);
+ APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
+ if (APInt(Known.getBitWidth(), BitsToClear)
+ .isSubsetOf(AlreadyZeroedBitsToClear))
+ return TLO.CombineTo(Op, Op0);
+
+ Known = KnownOp0 &
+ KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
+
+ return false;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (auto ElementSize = IsSVECntIntrinsic(Op)) {
+ unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
+ if (!MaxSVEVectorSizeInBits)
+ MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
+ unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
+ // The SVE count intrinsics don't support the multiplier immediate so we
+ // don't have to account for that here. The value returned may be slightly
+ // over the true required bits, as this is based on the "ALL" pattern. The
+ // other patterns are also exposed by these intrinsics, but they all
+ // return a value that's strictly less than "ALL".
+ unsigned RequiredBits = llvm::bit_width(MaxElements);
+ unsigned BitWidth = Known.Zero.getBitWidth();
+ if (RequiredBits < BitWidth)
+ Known.Zero.setHighBits(BitWidth - RequiredBits);
+ return false;
+ }
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
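
For the VSHL/VLSHR fold above: the shift pair only clears the low ShiftLBits bits, so if none of those bits are demanded the pair can be replaced by its input. A minimal standalone C++ sketch, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Val = 0xDEADBEEF;
  unsigned N = 8;                          // hypothetical shift amount
  uint32_t RoundTripped = (Val >> N) << N; // models VSHL(VLSHR(Val, N), N): clears the low N bits
  uint32_t DemandedMask = 0xFFFFFF00u;     // assume the low 8 bits are never used
  // On the demanded bits the shift pair is a no-op, so it can be folded away.
  assert((RoundTripped & DemandedMask) == (Val & DemandedMask));
  return 0;
}
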
+
+bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
+ return Op.getOpcode() == AArch64ISD::DUP ||
+ Op.getOpcode() == AArch64ISD::MOVI ||
+ (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
+ TargetLowering::isTargetCanonicalConstantNode(Op);
+}
+
+bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
+ return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
+ Subtarget->hasComplxNum();
+}
+
+bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation Operation, Type *Ty) const {
+ auto *VTy = dyn_cast<VectorType>(Ty);
+ if (!VTy)
+ return false;
+
+ // If the vector is scalable, SVE is enabled, implying support for complex
+  // numbers. Otherwise, we need to ensure complex number support is available.
+ if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
+ return false;
+
+ auto *ScalarTy = VTy->getScalarType();
+ unsigned NumElements = VTy->getElementCount().getKnownMinValue();
+
+  // We can only process vectors that have a bit size of 128 or higher (with an
+  // additional 64-bit case for NEON). Additionally, these vectors must have a
+  // power-of-2 size, as we later split them into the smallest supported size
+  // and merge them back together after applying the complex operation.
+ unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+ if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
+ !llvm::isPowerOf2_32(VTyWidth))
+ return false;
+
+ if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
+ unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
+ return 8 <= ScalarWidth && ScalarWidth <= 64;
+ }
+
+ return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
+ ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
+}
+
+Value *AArch64TargetLowering::createComplexDeinterleavingIR(
+ IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
+ ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+ Value *Accumulator) const {
+ VectorType *Ty = cast<VectorType>(InputA->getType());
+ bool IsScalable = Ty->isScalableTy();
+ bool IsInt = Ty->getElementType()->isIntegerTy();
+
+ unsigned TyWidth =
+ Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
+
+ assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
+ "Vector type must be either 64 or a power of 2 that is at least 128");
+
+ if (TyWidth > 128) {
+ int Stride = Ty->getElementCount().getKnownMinValue() / 2;
+ auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
+ auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
+ auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
+ auto *UpperSplitA =
+ B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
+ auto *UpperSplitB =
+ B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
+ Value *LowerSplitAcc = nullptr;
+ Value *UpperSplitAcc = nullptr;
+ if (Accumulator) {
+ LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
+ UpperSplitAcc =
+ B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
+ }
+ auto *LowerSplitInt = createComplexDeinterleavingIR(
+ B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+ auto *UpperSplitInt = createComplexDeinterleavingIR(
+ B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+ auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
+ B.getInt64(0));
+ return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
+ }
+
+ if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+ if (Accumulator == nullptr)
+ Accumulator = Constant::getNullValue(Ty);
+
+ if (IsScalable) {
+ if (IsInt)
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_cmla_x, Ty,
+ {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+
+ auto *Mask = B.getAllOnesMask(Ty->getElementCount());
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_fcmla, Ty,
+ {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+ }
+
+ Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
+ Intrinsic::aarch64_neon_vcmla_rot90,
+ Intrinsic::aarch64_neon_vcmla_rot180,
+ Intrinsic::aarch64_neon_vcmla_rot270};
+
+ return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
+ {Accumulator, InputA, InputB});
+ }
+
+ if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+ if (IsScalable) {
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_270) {
+ if (IsInt)
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_cadd_x, Ty,
+ {InputA, InputB, B.getInt32((int)Rotation * 90)});
+
+ auto *Mask = B.getAllOnesMask(Ty->getElementCount());
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_fcadd, Ty,
+ {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
+ }
+ return nullptr;
+ }
+
+ Intrinsic::ID IntId = Intrinsic::not_intrinsic;
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+ IntId = Intrinsic::aarch64_neon_vcadd_rot90;
+ else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ IntId = Intrinsic::aarch64_neon_vcadd_rot270;
+
+ if (IntId == Intrinsic::not_intrinsic)
+ return nullptr;
+
+ return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
+ }
+
+ return nullptr;
+}
+
+bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
+ unsigned Opc = N->getOpcode();
+ if (ISD::isExtOpcode(Opc)) {
+ if (any_of(N->uses(),
+ [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
+ return false;
+ }
+ return true;
+}
+
+unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
+ return Subtarget->getMinimumJumpTableEntries();
+}
+
+MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ bool NonUnitFixedLengthVector =
+ VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+ if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+ EVT VT1;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
+ RegisterVT);
+ return RegisterVT;
+}
+
+unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
+ bool NonUnitFixedLengthVector =
+ VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
+ if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+ EVT VT1;
+ MVT VT2;
+ unsigned NumIntermediates;
+ return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
+ NumIntermediates, VT2);
+}
+
+unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+ int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
+ Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
+ if (!RegisterVT.isFixedLengthVector() ||
+ RegisterVT.getFixedSizeInBits() <= 128)
+ return NumRegs;
+
+ assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
+ assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
+ assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
+
+ // A size mismatch here implies either type promotion or widening and would
+  // have resulted in scalarisation if larger vectors had not been available.
+ if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
+ EVT EltTy = VT.getVectorElementType();
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+
+ IntermediateVT = NewVT;
+ NumIntermediates = VT.getVectorNumElements();
+ RegisterVT = getRegisterType(Context, NewVT);
+ return NumIntermediates;
+ }
+
+ // SVE VLS support does not introduce a new ABI so we should use NEON sized
+ // types for vector arguments and returns.
+
+ unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
+ NumIntermediates *= NumSubRegs;
+ NumRegs *= NumSubRegs;
+
+ switch (RegisterVT.getVectorElementType().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for vector");
+ case MVT::i8:
+ IntermediateVT = RegisterVT = MVT::v16i8;
+ break;
+ case MVT::i16:
+ IntermediateVT = RegisterVT = MVT::v8i16;
+ break;
+ case MVT::i32:
+ IntermediateVT = RegisterVT = MVT::v4i32;
+ break;
+ case MVT::i64:
+ IntermediateVT = RegisterVT = MVT::v2i64;
+ break;
+ case MVT::f16:
+ IntermediateVT = RegisterVT = MVT::v8f16;
+ break;
+ case MVT::f32:
+ IntermediateVT = RegisterVT = MVT::v4f32;
+ break;
+ case MVT::f64:
+ IntermediateVT = RegisterVT = MVT::v2f64;
+ break;
+ case MVT::bf16:
+ IntermediateVT = RegisterVT = MVT::v8bf16;
+ break;
+ }
+
+ return NumRegs;
+}
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+ const MachineFunction &MF) const {
+ return !Subtarget->isTargetWindows() &&
+ MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
+}
+
+#ifndef NDEBUG
+void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case AArch64ISD::SUNPKLO:
+ case AArch64ISD::SUNPKHI:
+ case AArch64ISD::UUNPKLO:
+ case AArch64ISD::UUNPKHI: {
+ assert(N->getNumValues() == 1 && "Expected one result!");
+ assert(N->getNumOperands() == 1 && "Expected one operand!");
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+ assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
+ VT.isInteger() && "Expected integer vectors!");
+ assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Expected vectors of equal size!");
+ // TODO: Enable assert once bogus creations have been fixed.
+ // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
+ // "Expected result vector with half the lanes of its input!");
+ break;
+ }
+ case AArch64ISD::TRN1:
+ case AArch64ISD::TRN2:
+ case AArch64ISD::UZP1:
+ case AArch64ISD::UZP2:
+ case AArch64ISD::ZIP1:
+ case AArch64ISD::ZIP2: {
+ assert(N->getNumValues() == 1 && "Expected one result!");
+ assert(N->getNumOperands() == 2 && "Expected two operands!");
+ EVT VT = N->getValueType(0);
+ EVT Op0VT = N->getOperand(0).getValueType();
+ EVT Op1VT = N->getOperand(1).getValueType();
+ assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
+ "Expected vectors!");
+ // TODO: Enable assert once bogus creations have been fixed.
+ // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
+ break;
+ }
+ }
+}
+#endif
>From e88bcd8a81b7a7eb28fc395b128c68755935ef03 Mon Sep 17 00:00:00 2001
From: AtariDreams <gfunni234 at gmail.com>
Date: Sat, 29 Jun 2024 10:23:06 -0400
Subject: [PATCH 5/5] Update AArch64ISelLowering.cpp
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5c6453ed81f2b..47bf0e0cbd63b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3902,8 +3902,12 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
!isLegalArithImmed(-RHS->getAsZExtVal()))) {
SDValue TheLHS =
isCMN(LHS, LHS.getOperand(1), CC, DAG) ? LHS.getOperand(1) : LHS;
- SDValue TheRHS = !isa<ConstantSDNode>(RHS) && isCMN(RHS, RHS.getOperand(1), CC, DAG) ? RHS.getOperand(1) : RHS;
- if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(TheRHS)) {
+ SDValue TheRHS =
+ !isa<ConstantSDNode>(RHS) && isCMN(RHS, RHS.getOperand(1), CC, DAG)
+ ? RHS.getOperand(1)
+ : RHS;
+ if (getCmpOperandFoldingProfit(TheLHS) >
+ getCmpOperandFoldingProfit(TheRHS)) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
}
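For context on this hunk: when the RHS looks like the better CMN candidate, the compare operands are swapped and the condition code must be swapped with them, which is what ISD::getSetCCSwappedOperands does. A trivial standalone C++ reminder of that identity, not part of the patch:

#include <cassert>

int main() {
  int A = 3, B = 7;
  // Swapping the compare operands requires swapping the predicate too
  // (GT <-> LT, GE <-> LE, and so on).
  assert((A > B) == (B < A));
  assert((A >= B) == (B <= A));
  return 0;
}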