[llvm] Not-quite-working prototype with ISD node (PR #118810)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 6 07:59:10 PST 2024
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/118810
>From c754687266d353e65f86bde2291a51211d29428b Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 3 Dec 2024 14:52:40 +0000
Subject: [PATCH] [SelectionDAG] Add an ISD node for vector.extract.last.active
Since we shouldn't change lowering in SelectionDAGBuilder based on the
target, introduce a new ISD node for extract.last.active and perform
the current lowering in LegalizeVectorOps instead.

This results in worse codegen for now, but it makes it easy for a
target to match a single ISD node and improve the output.
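
For context, a minimal IR example of the intrinsic this node covers
(the function and value names here are illustrative, not taken from
the patch's tests):

  ; illustrative example only, not part of this patch
  declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)

  define i8 @last_active(<16 x i8> %data, <16 x i1> %mask, i8 %passthru) {
    %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %mask, i8 %passthru)
    ret i8 %res
  }

With this patch, SelectionDAGBuilder emits a single VECTOR_EXTRACT_LAST_ACTIVE
node for such a call; targets that leave the operation at its default Expand
action get the generic stepvector + VECREDUCE_UMAX lowering from
LegalizeVectorOps instead.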
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 24 ++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 +
.../SelectionDAG/LegalizeTypesGeneric.cpp | 61 +++++
.../SelectionDAG/LegalizeVectorOps.cpp | 80 ++++++
.../SelectionDAG/SelectionDAGBuilder.cpp | 37 +--
.../SelectionDAG/SelectionDAGDumper.cpp | 3 +
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 +
.../Target/AArch64/AArch64ISelLowering.cpp | 10 +
.../AArch64/vector-extract-last-active.ll | 240 ++++++++++--------
.../RISCV/rvv/vector-extract-last-active.ll | 165 +++++++-----
11 files changed, 432 insertions(+), 199 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 0b6d155b6d161e..f0a8ccc41eb3d3 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1480,6 +1480,10 @@ enum NodeType {
// Output: Output Chain
EXPERIMENTAL_VECTOR_HISTOGRAM,
+ // experimental.vector.extract.last.active intrinsic
+ // Operands: Data, Mask, PassThru
+ VECTOR_EXTRACT_LAST_ACTIVE,
+
// llvm.clear_cache intrinsic
// Operands: Input Chain, Start Addres, End Address
// Outputs: Output Chain
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 986d69e6c7a9e0..e4ca5486efd5a7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -155,6 +155,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Res = PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(N);
+ break;
+
case ISD::SIGN_EXTEND:
case ISD::VP_SIGN_EXTEND:
case ISD::ZERO_EXTEND:
@@ -2069,6 +2073,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo);
break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Res = PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2803,6 +2810,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N,
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue
+DAGTypeLegalizer::PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 3> NewOps(N->ops());
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Integer Result Expansion
//===----------------------------------------------------------------------===//
@@ -2840,6 +2855,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(N, Lo, Hi);
+ break;
case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
@@ -6102,6 +6120,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
}
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, SDLoc(N), NVT, N->ops());
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
EVT OutVT = N->getValueType(0);
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 1703149aca7463..ef3cd66df25363 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -378,6 +378,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -428,6 +429,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, unsigned OpNo);
void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -1214,6 +1216,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, SDValue &Lo,
+ SDValue &Hi);
void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 2655e8428309da..d297eca620a094 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -19,6 +19,7 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;
@@ -244,6 +245,66 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
std::swap(Lo, Hi);
}
+void DAGTypeLegalizer::ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
+ SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Data = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ SDValue PassThru = N->getOperand(2);
+
+ ElementCount OldEltCount = Data.getValueType().getVectorElementCount();
+ EVT OldEltVT = Data.getValueType().getVectorElementType();
+ SDLoc dl(N);
+
+ EVT OldVT = N->getValueType(0);
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+ if (OldVT != OldEltVT) {
+ // The result of EXTRACT_LAST_ACTIVE may be larger than the element type of
+ // the input vector. If so, extend the elements of the input vector to the
+ // same bitwidth as the result before expanding.
+    assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
+ EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount);
+ Data = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+ }
+
+ SDValue NewVec = DAG.getNode(
+ ISD::BITCAST, dl,
+ EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), Data);
+
+ auto [DataLo, DataHi] = DAG.SplitVector(NewVec, dl);
+ auto [PassLo, PassHi] = DAG.SplitScalar(PassThru, dl, NewVT, NewVT);
+
+ EVT SplitVT = DataLo.getValueType();
+
+ // TODO: I *think* this works correctly, but I haven't confirmed it yet by
+ // actually running a compiled program with example data.
+ //
+ // We want the matching lo and hi parts from whichever lane was the last
+ // active.
+ SDValue Deinterleaved;
+ if (SplitVT.isFixedLengthVector()) {
+ unsigned SplitNum = SplitVT.getVectorMinNumElements();
+ SDValue Even = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
+ createStrideMask(0, 2, SplitNum));
+ SDValue Odd = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
+ createStrideMask(1, 2, SplitNum));
+ Deinterleaved = DAG.getMergeValues({Even, Odd}, dl);
+ } else
+ Deinterleaved =
+ DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl,
+ DAG.getVTList(SplitVT, SplitVT), DataLo, DataHi);
+
+ Lo = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
+ Deinterleaved.getValue(0), Mask, PassLo);
+ Hi = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
+ Deinterleaved.getValue(1), Mask, PassHi);
+
+ // FIXME: Endianness?
+ assert(!DAG.getDataLayout().isBigEndian() &&
+ "Implement big endian result expansion for extract_last_active");
+}
+
void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index db21e708970648..383d26bb8b8248 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -138,6 +139,7 @@ class VectorLegalizer {
SDValue ExpandVP_FNEG(SDNode *Node);
SDValue ExpandVP_FABS(SDNode *Node);
SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
+ SDValue ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -465,6 +467,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECTOR_COMPRESS:
case ISD::SCMP:
case ISD::UCMP:
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -1202,6 +1205,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECTOR_COMPRESS:
Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
return;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Results.push_back(ExpandVECTOR_EXTRACT_LAST_ACTIVE(Node));
+ return;
case ISD::SCMP:
case ISD::UCMP:
Results.push_back(TLI.expandCMP(Node, DAG));
@@ -1713,6 +1719,80 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
}
+SDValue VectorLegalizer::ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node) {
+ SDLoc DL(Node);
+ SDValue Data = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+ SDValue PassThru = Node->getOperand(2);
+
+ EVT DataVT = Data.getValueType();
+ EVT ScalarVT = PassThru.getValueType();
+ EVT BoolVT = Mask.getValueType().getScalarType();
+
+ // Find a suitable type for a stepvector.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ if (DataVT.isScalableVector())
+ VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned EltWidth = TLI.getBitWidthForCttzElements(
+ ScalarVT.getTypeForEVT(*DAG.getContext()), DataVT.getVectorElementCount(),
+ /*ZeroIsPoison=*/true, &VScaleRange);
+
+ // HACK: If the target selects a VT that's too wide based on the legal types
+  // for a vecreduce_umax, it will force expansion of the node -- which
+ // doesn't work on scalable vectors...
+ // Is there another method we could use to get a smaller VT instead
+ // of just capping to 32b?
+ EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+ EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+ // HACK: If the target selects a VT that's too small to form a legal vector
+ // type, we also run into problems trying to expand the vecreduce_umax.
+ //
+ // I think perhaps we need to revisit how getBitWidthForCttzElements
+ // works...
+ if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
+ TargetLowering::TypePromoteInteger) {
+ StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ StepVT = StepVecVT.getVectorElementType();
+ }
+
+ // Zero out lanes with inactive elements, then find the highest remaining
+ // value from the stepvector.
+ SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
+ SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
+ SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
+
+ // HACK: Unfortunately, LegalizeVectorOps does not recursively legalize *all*
+ // added nodes, just the end result nodes until it finds legal ops.
+ // LegalizeDAG doesn't handle VSELECT at all presently. So if we need to
+ // legalize a vselect then we have to do it here.
+ //
+ // We might want to change LegalizeVectorOps to walk backwards through the
+ // nodes like LegalizeDAG? And share VSELECT legalization code with
+ // LegalizeDAG?
+ //
+ // Or would that cause problems with illegal types that we might have just
+ // introduced?
+ //
+  // Having an op with illegal types marked as Legal should work, with the
+ // expectation being that type legalization fixes it up later.
+ if (TLI.getOperationAction(ISD::VSELECT, StepVecVT) == TargetLowering::Expand)
+ ActiveElts = LegalizeOp(ActiveElts);
+
+ SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
+
+ // Extract the corresponding lane from the data vector
+ EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, DL, ExtVT);
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx);
+
+ // If all mask lanes were inactive, choose the passthru value instead.
+ SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask);
+ return DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b72c5eff22f183..61ffce14ce96a1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6431,43 +6431,18 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
unsigned Intrinsic) {
assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
"Tried lowering invalid vector extract last");
+
SDLoc sdl = getCurSDLoc();
SDValue Data = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
SDValue PassThru = getValue(I.getOperand(2));
- EVT DataVT = Data.getValueType();
- EVT ScalarVT = PassThru.getValueType();
- EVT BoolVT = Mask.getValueType().getScalarType();
-
- // Find a suitable type for a stepvector.
- ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
- if (DataVT.isScalableVector())
- VScaleRange = getVScaleRange(I.getCaller(), 64);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned EltWidth = TLI.getBitWidthForCttzElements(
- I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
- &VScaleRange);
- MVT StepVT = MVT::getIntegerVT(EltWidth);
- EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
- // Zero out lanes with inactive elements, then find the highest remaining
- // value from the stepvector.
- SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
- SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
- SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
- SDValue HighestIdx =
- DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
-
- // Extract the corresponding lane from the data vector
- EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
- SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
-
- // If all mask lanes were inactive, choose the passthru value instead.
- SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
- SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl, ResultVT,
+ Data, Mask, PassThru);
+
setValue(&I, Result);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 580ff19065557b..42cbb721703d99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,6 +567,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return "histogram";
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ return "extract_last_active";
+
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
case ISD::SDID: \
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 392cfbdd21273d..5ea39124a8e55a 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -818,6 +818,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SDOPC, VT, Expand);
#include "llvm/IR/VPIntrinsics.def"
+ // Masked vector extracts default to expand.
+ setOperationAction(ISD::VECTOR_EXTRACT_LAST_ACTIVE, VT, Expand);
+
// FP environment operations default to expand.
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d1354ccf376609..85db501a61989d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,6 +401,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
+ // TODO: Should we include any other operations here? The calls to
+ // addDRType/addQRType below do mark VSELECT as Expand for the
+ // specified VTs, but leave other illegal types as the default
+ // of 'Legal'. LegalizeDAG doesn't legalize VSELECT after type
+ // legalization if LegalizeVectorOps introduces one.
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index c0f1720e1cf8b3..a0e9c6607042f6 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -7,21 +7,26 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; NEON-FIXED: // %bb.0:
; NEON-FIXED-NEXT: sub sp, sp, #16
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0
+; NEON-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
+; NEON-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
-; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: mov x11, sp
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; NEON-FIXED-NEXT: umaxv b1, v1.16b
-; NEON-FIXED-NEXT: umaxv b2, v2.16b
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
-; NEON-FIXED-NEXT: ldrb w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
+; NEON-FIXED-NEXT: fmov w10, s1
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfxil x11, x10, #0, #4
+; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; NEON-FIXED-NEXT: lsr x9, x8, #16
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: ldrb w9, [x11]
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; NEON-FIXED-NEXT: tst w8, #0xff
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -29,20 +34,25 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; SVE-FIXED: // %bb.0:
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: index z2.b, #0, #1
-; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0
-; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
+; SVE-FIXED-NEXT: index z4.b, #0, #1
+; SVE-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
; SVE-FIXED-NEXT: umaxv b1, v1.16b
-; SVE-FIXED-NEXT: umaxv b2, v2.16b
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
-; SVE-FIXED-NEXT: ldrb w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: fmov w10, s1
+; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; SVE-FIXED-NEXT: bfxil x11, x10, #0, #4
+; SVE-FIXED-NEXT: lsr x9, x8, #16
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: ldrb w9, [x11]
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; SVE-FIXED-NEXT: tst w8, #0xff
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <16 x i8> %mask, zeroinitializer
@@ -57,19 +67,22 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: mov x11, sp
; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0]
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.8b, v1.8h
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxv b1, v1.8b
-; NEON-FIXED-NEXT: umaxv b2, v2.8b
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
-; NEON-FIXED-NEXT: ldrh w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: umaxv b1, v2.8b
+; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; NEON-FIXED-NEXT: lsr x9, x8, #16
+; NEON-FIXED-NEXT: fmov w10, s1
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; NEON-FIXED-NEXT: bfi x11, x10, #1, #3
+; NEON-FIXED-NEXT: tst w8, #0xff
+; NEON-FIXED-NEXT: ldrh w9, [x11]
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -79,18 +92,21 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
; SVE-FIXED-NEXT: index z2.b, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.8b, v1.8h
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxv b1, v1.8b
-; SVE-FIXED-NEXT: umaxv b2, v2.8b
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
-; SVE-FIXED-NEXT: ldrh w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: umaxv b1, v2.8b
+; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; SVE-FIXED-NEXT: lsr x9, x8, #16
+; SVE-FIXED-NEXT: fmov w10, s1
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; SVE-FIXED-NEXT: bfi x11, x10, #1, #3
+; SVE-FIXED-NEXT: tst w8, #0xff
+; SVE-FIXED-NEXT: ldrh w9, [x11]
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <8 x i16> %mask, zeroinitializer
@@ -105,19 +121,21 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: mov x11, sp
; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0]
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxv h2, v2.4h
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: ldr w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s2
+; NEON-FIXED-NEXT: tst w8, #0xffff
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr w9, [x11]
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -127,18 +145,20 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; SVE-FIXED-NEXT: index z2.h, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxv h2, v2.4h
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: ldr w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s2
+; SVE-FIXED-NEXT: tst w8, #0xffff
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr w9, [x11]
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <4 x i32> %mask, zeroinitializer
@@ -153,19 +173,20 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI3_0
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: mov x10, sp
; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0]
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT: ldr x8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel x0, x8, x0, ne
+; NEON-FIXED-NEXT: fmov w9, s2
+; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: ldr x10, [x10]
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: cmp w8, #0
+; NEON-FIXED-NEXT: csel x0, x10, x0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -175,18 +196,19 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; SVE-FIXED-NEXT: index z2.s, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov x10, sp
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT: ldr x8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel x0, x8, x0, ne
+; SVE-FIXED-NEXT: fmov w9, s2
+; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: ldr x10, [x10]
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: cmp w8, #0
+; SVE-FIXED-NEXT: csel x0, x10, x0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <2 x i64> %mask, zeroinitializer
@@ -201,18 +223,20 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI4_0
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: mov x11, sp
; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0]
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxv h3, v3.4h
-; NEON-FIXED-NEXT: fmov w8, s3
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: fmov w8, s1
-; NEON-FIXED-NEXT: ldr s0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s3
+; NEON-FIXED-NEXT: tst w8, #0xffff
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr s0, [x11]
; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -223,17 +247,19 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; SVE-FIXED-NEXT: index z3.h, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxv h3, v3.4h
-; SVE-FIXED-NEXT: fmov w8, s3
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: fmov w8, s1
-; SVE-FIXED-NEXT: ldr s0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s3
+; SVE-FIXED-NEXT: tst w8, #0xffff
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr s0, [x11]
; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -249,18 +275,19 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI5_0
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: mov x10, sp
; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; NEON-FIXED-NEXT: fmov w8, s3
-; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT: fmov w8, s1
-; NEON-FIXED-NEXT: ldr d0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: fmov w9, s3
+; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: ldr d0, [x10]
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: cmp w8, #0
; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -271,17 +298,18 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; SVE-FIXED-NEXT: index z3.s, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: mov x10, sp
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; SVE-FIXED-NEXT: fmov w8, s3
-; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT: fmov w8, s1
-; SVE-FIXED-NEXT: ldr d0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: fmov w9, s3
+; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: ldr d0, [x10]
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: cmp w8, #0
; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -318,7 +346,7 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1
; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h
; CHECK-NEXT: umaxv h1, p1, z1.h
; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: and x8, x8, #0xffff
; CHECK-NEXT: whilels p2.h, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb w8, p2, z0.h
@@ -337,7 +365,7 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s
; CHECK-NEXT: umaxv s1, p1, z1.s
; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: whilels p2.s, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb w8, p2, z0.s
@@ -356,7 +384,6 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
; CHECK-NEXT: umaxv d1, p1, z1.d
; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: and x8, x8, #0xff
; CHECK-NEXT: whilels p2.d, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb x8, p2, z0.d
@@ -375,7 +402,7 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x
; CHECK-NEXT: sel z2.s, p0, z2.s, z3.s
; CHECK-NEXT: umaxv s2, p1, z2.s
; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: whilels p2.s, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb s0, p2, z0.s
@@ -394,7 +421,6 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale
; CHECK-NEXT: sel z2.d, p0, z2.d, z3.d
; CHECK-NEXT: umaxv d2, p1, z2.d
; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: and x8, x8, #0xff
; CHECK-NEXT: whilels p2.d, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb d0, p2, z0.d
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
index 1eef183db21bb3..81ff400b38cb47 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -76,23 +76,31 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vmsne.vi v0, v9, 0
+; RV32-NEXT: vmsne.vi v9, v9, 0
+; RV32-NEXT: vmv.v.i v0, 1
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv1r.v v11, v10
+; RV32-NEXT: vcpop.m a2, v9
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32-NEXT: vrgather.vi v11, v8, 1, v0.t
+; RV32-NEXT: vmv1r.v v0, v9
; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vcpop.m a2, v0
-; RV32-NEXT: vid.v v9, v0.t
+; RV32-NEXT: vid.v v12, v0.t
; RV32-NEXT: beqz a2, .LBB3_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v9, v9, v9
-; RV32-NEXT: li a1, 32
+; RV32-NEXT: vredmaxu.vs v9, v12, v12
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vslideup.vi v8, v10, 1
+; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
; RV32-NEXT: vmv.x.s a0, v9
; RV32-NEXT: andi a0, a0, 255
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v9, v11, a0
; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: .LBB3_2:
; RV32-NEXT: ret
;
@@ -168,22 +176,39 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
}
define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
-; CHECK-LABEL: extract_last_i8_scalable:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vcpop.m a1, v0
-; CHECK-NEXT: vid.v v10, v0.t
-; CHECK-NEXT: beqz a1, .LBB6_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vredmaxu.vs v10, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v10
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: .LBB6_2:
-; CHECK-NEXT: ret
+; RV32-LABEL: extract_last_i8_scalable:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vid.v v16, v0.t
+; RV32-NEXT: beqz a1, .LBB6_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: vredmaxu.vs v10, v16, v16
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: .LBB6_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extract_last_i8_scalable:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV64-NEXT: vmv.v.i v16, 0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vid.v v16, v0.t
+; RV64-NEXT: beqz a1, .LBB6_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: vredmaxu.vs v10, v16, v16
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v8, a0
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: .LBB6_2:
+; RV64-NEXT: ret
%res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
ret i8 %res
}
@@ -191,16 +216,14 @@ define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1>
define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
; RV32-LABEL: extract_last_i16_scalable:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV32-NEXT: vmv.v.i v12, 0
; RV32-NEXT: vcpop.m a1, v0
-; RV32-NEXT: vid.v v10, v0.t
+; RV32-NEXT: vid.v v12, v0.t
; RV32-NEXT: beqz a1, .LBB7_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v10, v10
+; RV32-NEXT: vredmaxu.vs v10, v12, v12
; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vslidedown.vx v8, v8, a0
; RV32-NEXT: vmv.x.s a0, v8
@@ -209,16 +232,16 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1
;
; RV64-LABEL: extract_last_i16_scalable:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV64-NEXT: vmv.v.i v12, 0
; RV64-NEXT: vcpop.m a1, v0
-; RV64-NEXT: vid.v v10, v0.t
+; RV64-NEXT: vid.v v12, v0.t
; RV64-NEXT: beqz a1, .LBB7_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v10, v10
+; RV64-NEXT: vredmaxu.vs v10, v12, v12
; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 48
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV64-NEXT: vslidedown.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
@@ -269,27 +292,27 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64_scalable:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, mu
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
; RV32-NEXT: vmv.v.i v10, 0
; RV32-NEXT: vcpop.m a2, v0
; RV32-NEXT: vid.v v10, v0.t
; RV32-NEXT: beqz a2, .LBB9_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: li a1, 32
; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vnsrl.wi v10, v8, 0
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vnsrl.wx v11, v8, a1
+; RV32-NEXT: vslidedown.vx v8, v10, a0
+; RV32-NEXT: vslidedown.vx v9, v11, a0
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: .LBB9_2:
; RV32-NEXT: ret
;
; RV64-LABEL: extract_last_i64_scalable:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu
; RV64-NEXT: vmv.v.i v10, 0
; RV64-NEXT: vcpop.m a1, v0
; RV64-NEXT: vid.v v10, v0.t
@@ -297,6 +320,8 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
; RV64-NEXT: # %bb.1:
; RV64-NEXT: vredmaxu.vs v10, v10, v10
; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vslidedown.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
@@ -345,21 +370,39 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x
}
define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
-; CHECK-LABEL: extract_last_double_scalable:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vcpop.m a0, v0
-; CHECK-NEXT: vid.v v10, v0.t
-; CHECK-NEXT: beqz a0, .LBB11_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vredmaxu.vs v10, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v10
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB11_2:
-; CHECK-NEXT: ret
+; RV32-LABEL: extract_last_double_scalable:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vcpop.m a0, v0
+; RV32-NEXT: vid.v v10, v0.t
+; RV32-NEXT: beqz a0, .LBB11_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: vredmaxu.vs v10, v10, v10
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: .LBB11_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extract_last_double_scalable:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vcpop.m a0, v0
+; RV64-NEXT: vid.v v10, v0.t
+; RV64-NEXT: beqz a0, .LBB11_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: vredmaxu.vs v10, v10, v10
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v8, a0
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: .LBB11_2:
+; RV64-NEXT: ret
%res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
ret double %res
}