[llvm] [SelectionDAG] Add an ISD node for vector.extract.last.active (PR #118810)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 13 08:00:38 PST 2025
https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/118810
From 456f07945118b7bff27f4733415c440d5e3566a0 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Tue, 3 Dec 2024 14:52:40 +0000
Subject: [PATCH 1/3] [SelectionDAG] Add an ISD node for
vector.extract.last.active
Since we shouldn't be changing lowering in SelectionDAGBuilder based on
the target, introduce a new ISD node for extract.last.active and
perform the current lowering in LegalizeVectorOps.
This results in worse codegen for now, but it's easy for a target to
match a single ISD node and improve the output.
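A minimal sketch of what SelectionDAGBuilder now emits for the intrinsic
(mirroring the visitVectorExtractLastActive hunk below; a fragment for
illustration, not standalone code):

    SDValue Data = getValue(I.getOperand(0));     // vector to search
    SDValue Mask = getValue(I.getOperand(1));     // one i1 lane per element
    SDValue PassThru = getValue(I.getOperand(2)); // used if no lane is active
    EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
    // A single node for targets to match; by default LegalizeVectorOps
    // expands it using the old stepvector + vecreduce_umax sequence.
    SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl,
                                 ResultVT, Data, Mask, PassThru);
    setValue(&I, Result);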
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 24 ++
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 4 +
.../SelectionDAG/LegalizeTypesGeneric.cpp | 61 ++++
.../SelectionDAG/LegalizeVectorOps.cpp | 80 +++++
.../SelectionDAG/SelectionDAGBuilder.cpp | 37 +--
.../SelectionDAG/SelectionDAGDumper.cpp | 3 +
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 +
.../Target/AArch64/AArch64ISelLowering.cpp | 10 +
.../AArch64/vector-extract-last-active.ll | 300 ++++++++++--------
.../RISCV/rvv/vector-extract-last-active.ll | 165 ++++++----
11 files changed, 462 insertions(+), 229 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 604dc9419025b0..d2ed8ec2e74663 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1480,6 +1480,10 @@ enum NodeType {
// Output: Output Chain
EXPERIMENTAL_VECTOR_HISTOGRAM,
+ // experimental.vector.extract.last.active intrinsic
+ // Operands: Data, Mask, PassThru
+ VECTOR_EXTRACT_LAST_ACTIVE,
+
// llvm.clear_cache intrinsic
// Operands: Input Chain, Start Addres, End Address
// Outputs: Output Chain
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index be7521f3416850..6d75f0788203f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -155,6 +155,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Res = PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(N);
+ break;
+
case ISD::SIGN_EXTEND:
case ISD::VP_SIGN_EXTEND:
case ISD::ZERO_EXTEND:
@@ -2069,6 +2073,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo);
break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Res = PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2810,6 +2817,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N,
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue
+DAGTypeLegalizer::PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 3> NewOps(N->ops());
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Integer Result Expansion
//===----------------------------------------------------------------------===//
@@ -2848,6 +2863,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(N, Lo, Hi);
+ break;
case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
@@ -6124,6 +6142,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
}
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, SDLoc(N), NVT, N->ops());
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
EVT OutVT = N->getValueType(0);
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 571a710cc92a34..0fc51c33d5f181 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -378,6 +378,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -428,6 +429,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, unsigned OpNo);
void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -1215,6 +1217,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, SDValue &Lo,
+ SDValue &Hi);
void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 113a3bc0bbea69..f7d4800487d609 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -19,6 +19,7 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;
@@ -244,6 +245,66 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
std::swap(Lo, Hi);
}
+void DAGTypeLegalizer::ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
+ SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Data = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+ SDValue PassThru = N->getOperand(2);
+
+ ElementCount OldEltCount = Data.getValueType().getVectorElementCount();
+ EVT OldEltVT = Data.getValueType().getVectorElementType();
+ SDLoc dl(N);
+
+ EVT OldVT = N->getValueType(0);
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+ if (OldVT != OldEltVT) {
+ // The result of EXTRACT_LAST_ACTIVE may be larger than the element type of
+ // the input vector. If so, extend the elements of the input vector to the
+ // same bitwidth as the result before expanding.
+ assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
+ EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount);
+ Data = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+ }
+
+ SDValue NewVec = DAG.getNode(
+ ISD::BITCAST, dl,
+ EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), Data);
+
+ auto [DataLo, DataHi] = DAG.SplitVector(NewVec, dl);
+ auto [PassLo, PassHi] = DAG.SplitScalar(PassThru, dl, NewVT, NewVT);
+
+ EVT SplitVT = DataLo.getValueType();
+
+ // TODO: I *think* this works correctly, but I haven't confirmed it yet by
+ // actually running a compiled program with example data.
+ //
+ // We want the matching lo and hi parts from whichever lane was the last
+ // active.
+ SDValue Deinterleaved;
+ if (SplitVT.isFixedLengthVector()) {
+ unsigned SplitNum = SplitVT.getVectorMinNumElements();
+ SDValue Even = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
+ createStrideMask(0, 2, SplitNum));
+ SDValue Odd = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
+ createStrideMask(1, 2, SplitNum));
+ Deinterleaved = DAG.getMergeValues({Even, Odd}, dl);
+ } else
+ Deinterleaved =
+ DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl,
+ DAG.getVTList(SplitVT, SplitVT), DataLo, DataHi);
+
+ Lo = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
+ Deinterleaved.getValue(0), Mask, PassLo);
+ Hi = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
+ Deinterleaved.getValue(1), Mask, PassHi);
+
+ // FIXME: Endianness?
+ assert(!DAG.getDataLayout().isBigEndian() &&
+ "Implement big endian result expansion for extract_last_active");
+}
+
void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index e8404a13009a72..8026e5c845b418 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -138,6 +139,7 @@ class VectorLegalizer {
SDValue ExpandVP_FNEG(SDNode *Node);
SDValue ExpandVP_FABS(SDNode *Node);
SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
+ SDValue ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -467,6 +469,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECTOR_COMPRESS:
case ISD::SCMP:
case ISD::UCMP:
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -1208,6 +1211,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECTOR_COMPRESS:
Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
return;
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ Results.push_back(ExpandVECTOR_EXTRACT_LAST_ACTIVE(Node));
+ return;
case ISD::SCMP:
case ISD::UCMP:
Results.push_back(TLI.expandCMP(Node, DAG));
@@ -1719,6 +1725,80 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
}
+SDValue VectorLegalizer::ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node) {
+ SDLoc DL(Node);
+ SDValue Data = Node->getOperand(0);
+ SDValue Mask = Node->getOperand(1);
+ SDValue PassThru = Node->getOperand(2);
+
+ EVT DataVT = Data.getValueType();
+ EVT ScalarVT = PassThru.getValueType();
+ EVT BoolVT = Mask.getValueType().getScalarType();
+
+ // Find a suitable type for a stepvector.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ if (DataVT.isScalableVector())
+ VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned EltWidth = TLI.getBitWidthForCttzElements(
+ ScalarVT.getTypeForEVT(*DAG.getContext()), DataVT.getVectorElementCount(),
+ /*ZeroIsPoison=*/true, &VScaleRange);
+
+ // HACK: If the target selects a VT that's too wide based on the legal types
+ // for a vecreduce_umax, it will force expansion of the node -- which
+ // doesn't work on scalable vectors...
+ // Is there another method we could use to get a smaller VT instead
+ // of just capping to 32b?
+ EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+ EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
+
+ // HACK: If the target selects a VT that's too small to form a legal vector
+ // type, we also run into problems trying to expand the vecreduce_umax.
+ //
+ // I think perhaps we need to revisit how getBitWidthForCttzElements
+ // works...
+ if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
+ TargetLowering::TypePromoteInteger) {
+ StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ StepVT = StepVecVT.getVectorElementType();
+ }
+
+ // Zero out lanes with inactive elements, then find the highest remaining
+ // value from the stepvector.
+ SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
+ SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
+ SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
+
+ // HACK: Unfortunately, LegalizeVectorOps does not recursively legalize *all*
+ // added nodes, just the end result nodes until it finds legal ops.
+ // LegalizeDAG doesn't handle VSELECT at all presently. So if we need to
+ // legalize a vselect then we have to do it here.
+ //
+ // We might want to change LegalizeVectorOps to walk backwards through the
+ // nodes like LegalizeDAG? And share VSELECT legalization code with
+ // LegalizeDAG?
+ //
+ // Or would that cause problems with illegal types that we might have just
+ // introduced?
+ //
+ // Having a legal op with illegal types marked as Legal should work, with the
+ // expectation being that type legalization fixes it up later.
+ if (TLI.getOperationAction(ISD::VSELECT, StepVecVT) == TargetLowering::Expand)
+ ActiveElts = LegalizeOp(ActiveElts);
+
+ SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
+
+ // Extract the corresponding lane from the data vector
+ EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, DL, ExtVT);
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx);
+
+ // If all mask lanes were inactive, choose the passthru value instead.
+ SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask);
+ return DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru);
+}
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f8d7c3ef7bbe71..d1c644b0647189 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6426,43 +6426,18 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
unsigned Intrinsic) {
assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
"Tried lowering invalid vector extract last");
+
SDLoc sdl = getCurSDLoc();
SDValue Data = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
SDValue PassThru = getValue(I.getOperand(2));
- EVT DataVT = Data.getValueType();
- EVT ScalarVT = PassThru.getValueType();
- EVT BoolVT = Mask.getValueType().getScalarType();
-
- // Find a suitable type for a stepvector.
- ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
- if (DataVT.isScalableVector())
- VScaleRange = getVScaleRange(I.getCaller(), 64);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned EltWidth = TLI.getBitWidthForCttzElements(
- I.getType(), DataVT.getVectorElementCount(), /*ZeroIsPoison=*/true,
- &VScaleRange);
- MVT StepVT = MVT::getIntegerVT(EltWidth);
- EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
- // Zero out lanes with inactive elements, then find the highest remaining
- // value from the stepvector.
- SDValue Zeroes = DAG.getConstant(0, sdl, StepVecVT);
- SDValue StepVec = DAG.getStepVector(sdl, StepVecVT);
- SDValue ActiveElts = DAG.getSelect(sdl, StepVecVT, Mask, StepVec, Zeroes);
- SDValue HighestIdx =
- DAG.getNode(ISD::VECREDUCE_UMAX, sdl, StepVT, ActiveElts);
-
- // Extract the corresponding lane from the data vector
- EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, sdl, ExtVT);
- SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ScalarVT, Data, Idx);
-
- // If all mask lanes were inactive, choose the passthru value instead.
- SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
- SDValue Result = DAG.getSelect(sdl, ScalarVT, AnyActive, Extract, PassThru);
+ EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl, ResultVT,
+ Data, Mask, PassThru);
+
setValue(&I, Result);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 580ff19065557b..42cbb721703d99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,6 +567,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return "histogram";
+ case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
+ return "extract_last_active";
+
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
case ISD::SDID: \
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3b0e9c7526fd0a..cc822ad5ec50e8 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -818,6 +818,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SDOPC, VT, Expand);
#include "llvm/IR/VPIntrinsics.def"
+ // Masked vector extracts default to expand.
+ setOperationAction(ISD::VECTOR_EXTRACT_LAST_ACTIVE, VT, Expand);
+
// FP environment operations default to expand.
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3ad2905ce52076..14fd7851bfa104 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,6 +401,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
+ // TODO: Should we include any other operations here? The calls to
+ // addDRType/addQRType below do mark VSELECT as Expand for the
+ // specified VTs, but leave other illegal types as the default
+ // of 'Legal'. LegalizeDAG doesn't legalize VSELECT after type
+ // legalization if LegalizeVectorOps introduces one.
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index 5212acc6fca0f4..a0e9c6607042f6 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -7,21 +7,26 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; NEON-FIXED: // %bb.0:
; NEON-FIXED-NEXT: sub sp, sp, #16
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0
+; NEON-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
+; NEON-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
-; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: mov x11, sp
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b
+; NEON-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
+; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; NEON-FIXED-NEXT: umaxv b1, v1.16b
-; NEON-FIXED-NEXT: umaxv b2, v2.16b
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
-; NEON-FIXED-NEXT: ldrb w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
+; NEON-FIXED-NEXT: fmov w10, s1
+; NEON-FIXED-NEXT: fmov x8, d2
+; NEON-FIXED-NEXT: bfxil x11, x10, #0, #4
+; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; NEON-FIXED-NEXT: lsr x9, x8, #16
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: ldrb w9, [x11]
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; NEON-FIXED-NEXT: tst w8, #0xff
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -29,20 +34,25 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; SVE-FIXED: // %bb.0:
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: index z2.b, #0, #1
-; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0
-; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
+; SVE-FIXED-NEXT: index z4.b, #0, #1
+; SVE-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b
+; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; SVE-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
; SVE-FIXED-NEXT: umaxv b1, v1.16b
-; SVE-FIXED-NEXT: umaxv b2, v2.16b
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
-; SVE-FIXED-NEXT: ldrb w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
+; SVE-FIXED-NEXT: fmov x8, d2
+; SVE-FIXED-NEXT: fmov w10, s1
+; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; SVE-FIXED-NEXT: bfxil x11, x10, #0, #4
+; SVE-FIXED-NEXT: lsr x9, x8, #16
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: ldrb w9, [x11]
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; SVE-FIXED-NEXT: tst w8, #0xff
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <16 x i8> %mask, zeroinitializer
@@ -57,19 +67,22 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v2.8b, v1.8h
-; NEON-FIXED-NEXT: umaxv h1, v1.8h
-; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
-; NEON-FIXED-NEXT: umaxv b2, v2.8b
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
-; NEON-FIXED-NEXT: ldrh w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: xtn v1.8b, v1.8h
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: umaxv b1, v2.8b
+; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; NEON-FIXED-NEXT: lsr x9, x8, #16
+; NEON-FIXED-NEXT: fmov w10, s1
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; NEON-FIXED-NEXT: bfi x11, x10, #1, #3
+; NEON-FIXED-NEXT: tst w8, #0xff
+; NEON-FIXED-NEXT: ldrh w9, [x11]
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -78,19 +91,22 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
-; SVE-FIXED-NEXT: index z3.b, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: index z2.b, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v2.8b, v1.8h
-; SVE-FIXED-NEXT: umaxv h1, v1.8h
-; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
-; SVE-FIXED-NEXT: umaxv b2, v2.8b
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
-; SVE-FIXED-NEXT: ldrh w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: xtn v1.8b, v1.8h
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: umaxv b1, v2.8b
+; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
+; SVE-FIXED-NEXT: lsr x9, x8, #16
+; SVE-FIXED-NEXT: fmov w10, s1
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
+; SVE-FIXED-NEXT: bfi x11, x10, #1, #3
+; SVE-FIXED-NEXT: tst w8, #0xff
+; SVE-FIXED-NEXT: ldrh w9, [x11]
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <8 x i16> %mask, zeroinitializer
@@ -105,19 +121,21 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v2.4h, v1.4s
-; NEON-FIXED-NEXT: umaxv s1, v1.4s
-; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxv h2, v2.4h
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: ldr w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel w0, w8, w0, ne
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s2
+; NEON-FIXED-NEXT: tst w8, #0xffff
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr w9, [x11]
+; NEON-FIXED-NEXT: csel w0, w9, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -126,19 +144,21 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT: index z3.h, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: index z2.h, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v2.4h, v1.4s
-; SVE-FIXED-NEXT: umaxv s1, v1.4s
-; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxv h2, v2.4h
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: ldr w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel w0, w8, w0, ne
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s2
+; SVE-FIXED-NEXT: tst w8, #0xffff
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr w9, [x11]
+; SVE-FIXED-NEXT: csel w0, w9, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <4 x i32> %mask, zeroinitializer
@@ -153,19 +173,20 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI3_0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI3_0]
+; NEON-FIXED-NEXT: mov x10, sp
+; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v2.2s, v1.2d
-; NEON-FIXED-NEXT: umaxv s1, v1.4s
-; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; NEON-FIXED-NEXT: fmov w8, s2
-; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT: ldr x8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
-; NEON-FIXED-NEXT: csel x0, x8, x0, ne
+; NEON-FIXED-NEXT: fmov w9, s2
+; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: ldr x10, [x10]
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: cmp w8, #0
+; NEON-FIXED-NEXT: csel x0, x10, x0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -174,19 +195,20 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT: index z3.s, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: index z2.s, #0, #1
+; SVE-FIXED-NEXT: mov x10, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v2.2s, v1.2d
-; SVE-FIXED-NEXT: umaxv s1, v1.4s
-; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; SVE-FIXED-NEXT: fmov w8, s2
-; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT: ldr x8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
-; SVE-FIXED-NEXT: csel x0, x8, x0, ne
+; SVE-FIXED-NEXT: fmov w9, s2
+; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: ldr x10, [x10]
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: cmp w8, #0
+; SVE-FIXED-NEXT: csel x0, x10, x0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <2 x i64> %mask, zeroinitializer
@@ -201,18 +223,20 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI4_0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI4_0]
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v3.4h, v1.4s
-; NEON-FIXED-NEXT: umaxv s1, v1.4s
-; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
+; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxv h3, v3.4h
-; NEON-FIXED-NEXT: fmov w8, s3
-; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: fmov w8, s1
-; NEON-FIXED-NEXT: ldr s0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s3
+; NEON-FIXED-NEXT: tst w8, #0xffff
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr s0, [x11]
; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -222,18 +246,20 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT: index z4.h, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: index z3.h, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v3.4h, v1.4s
-; SVE-FIXED-NEXT: umaxv s1, v1.4s
-; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
+; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxv h3, v3.4h
-; SVE-FIXED-NEXT: fmov w8, s3
-; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: fmov w8, s1
-; SVE-FIXED-NEXT: ldr s0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s3
+; SVE-FIXED-NEXT: tst w8, #0xffff
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr s0, [x11]
; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -249,18 +275,19 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI5_0
-; NEON-FIXED-NEXT: mov x9, sp
-; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI5_0]
+; NEON-FIXED-NEXT: mov x10, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v3.2s, v1.2d
-; NEON-FIXED-NEXT: umaxv s1, v1.4s
-; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
+; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
+; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; NEON-FIXED-NEXT: fmov x8, d1
; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; NEON-FIXED-NEXT: fmov w8, s3
-; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT: fmov w8, s1
-; NEON-FIXED-NEXT: ldr d0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: fmov w9, s3
+; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
+; NEON-FIXED-NEXT: lsr x9, x8, #32
+; NEON-FIXED-NEXT: ldr d0, [x10]
+; NEON-FIXED-NEXT: orr w8, w8, w9
+; NEON-FIXED-NEXT: cmp w8, #0
; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -270,18 +297,19 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT: index z4.s, #0, #1
-; SVE-FIXED-NEXT: mov x9, sp
+; SVE-FIXED-NEXT: index z3.s, #0, #1
+; SVE-FIXED-NEXT: mov x10, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v3.2s, v1.2d
-; SVE-FIXED-NEXT: umaxv s1, v1.4s
-; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
+; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
+; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
+; SVE-FIXED-NEXT: fmov x8, d1
; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; SVE-FIXED-NEXT: fmov w8, s3
-; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT: fmov w8, s1
-; SVE-FIXED-NEXT: ldr d0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: fmov w9, s3
+; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
+; SVE-FIXED-NEXT: lsr x9, x8, #32
+; SVE-FIXED-NEXT: ldr d0, [x10]
+; SVE-FIXED-NEXT: orr w8, w8, w9
+; SVE-FIXED-NEXT: cmp w8, #0
; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -318,7 +346,7 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1
; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h
; CHECK-NEXT: umaxv h1, p1, z1.h
; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: and x8, x8, #0xffff
; CHECK-NEXT: whilels p2.h, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb w8, p2, z0.h
@@ -337,7 +365,7 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s
; CHECK-NEXT: umaxv s1, p1, z1.s
; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: whilels p2.s, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb w8, p2, z0.s
@@ -356,7 +384,6 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d
; CHECK-NEXT: umaxv d1, p1, z1.d
; CHECK-NEXT: fmov x8, d1
-; CHECK-NEXT: and x8, x8, #0xff
; CHECK-NEXT: whilels p2.d, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb x8, p2, z0.d
@@ -375,7 +402,7 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x
; CHECK-NEXT: sel z2.s, p0, z2.s, z3.s
; CHECK-NEXT: umaxv s2, p1, z2.s
; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: whilels p2.s, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb s0, p2, z0.s
@@ -394,7 +421,6 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale
; CHECK-NEXT: sel z2.d, p0, z2.d, z3.d
; CHECK-NEXT: umaxv d2, p1, z2.d
; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: and x8, x8, #0xff
; CHECK-NEXT: whilels p2.d, xzr, x8
; CHECK-NEXT: ptest p1, p0.b
; CHECK-NEXT: lastb d0, p2, z0.d
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
index 1eef183db21bb3..81ff400b38cb47 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -76,23 +76,31 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vmsne.vi v0, v9, 0
+; RV32-NEXT: vmsne.vi v9, v9, 0
+; RV32-NEXT: vmv.v.i v0, 1
+; RV32-NEXT: vslidedown.vi v10, v8, 1
+; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT: vmv.v.i v12, 0
+; RV32-NEXT: vmv1r.v v11, v10
+; RV32-NEXT: vcpop.m a2, v9
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; RV32-NEXT: vrgather.vi v11, v8, 1, v0.t
+; RV32-NEXT: vmv1r.v v0, v9
; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: vcpop.m a2, v0
-; RV32-NEXT: vid.v v9, v0.t
+; RV32-NEXT: vid.v v12, v0.t
; RV32-NEXT: beqz a2, .LBB3_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v9, v9, v9
-; RV32-NEXT: li a1, 32
+; RV32-NEXT: vredmaxu.vs v9, v12, v12
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vslideup.vi v8, v10, 1
+; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
; RV32-NEXT: vmv.x.s a0, v9
; RV32-NEXT: andi a0, a0, 255
-; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v9, v11, a0
; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: .LBB3_2:
; RV32-NEXT: ret
;
@@ -168,22 +176,39 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
}
define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
-; CHECK-LABEL: extract_last_i8_scalable:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vcpop.m a1, v0
-; CHECK-NEXT: vid.v v10, v0.t
-; CHECK-NEXT: beqz a1, .LBB6_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vredmaxu.vs v10, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v10
-; CHECK-NEXT: andi a0, a0, 255
-; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: .LBB6_2:
-; CHECK-NEXT: ret
+; RV32-LABEL: extract_last_i8_scalable:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vid.v v16, v0.t
+; RV32-NEXT: beqz a1, .LBB6_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: vredmaxu.vs v10, v16, v16
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: .LBB6_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extract_last_i8_scalable:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu
+; RV64-NEXT: vmv.v.i v16, 0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vid.v v16, v0.t
+; RV64-NEXT: beqz a1, .LBB6_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: vredmaxu.vs v10, v16, v16
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v8, a0
+; RV64-NEXT: vmv.x.s a0, v8
+; RV64-NEXT: .LBB6_2:
+; RV64-NEXT: ret
%res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
ret i8 %res
}
@@ -191,16 +216,14 @@ define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1>
define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
; RV32-LABEL: extract_last_i16_scalable:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV32-NEXT: vmv.v.i v12, 0
; RV32-NEXT: vcpop.m a1, v0
-; RV32-NEXT: vid.v v10, v0.t
+; RV32-NEXT: vid.v v12, v0.t
; RV32-NEXT: beqz a1, .LBB7_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v10, v10
+; RV32-NEXT: vredmaxu.vs v10, v12, v12
; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: slli a0, a0, 16
-; RV32-NEXT: srli a0, a0, 16
; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV32-NEXT: vslidedown.vx v8, v8, a0
; RV32-NEXT: vmv.x.s a0, v8
@@ -209,16 +232,16 @@ define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1
;
; RV64-LABEL: extract_last_i16_scalable:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu
+; RV64-NEXT: vmv.v.i v12, 0
; RV64-NEXT: vcpop.m a1, v0
-; RV64-NEXT: vid.v v10, v0.t
+; RV64-NEXT: vid.v v12, v0.t
; RV64-NEXT: beqz a1, .LBB7_2
; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v10, v10
+; RV64-NEXT: vredmaxu.vs v10, v12, v12
; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 48
-; RV64-NEXT: srli a0, a0, 48
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
; RV64-NEXT: vslidedown.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
@@ -269,27 +292,27 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64_scalable:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e64, m2, ta, mu
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
; RV32-NEXT: vmv.v.i v10, 0
; RV32-NEXT: vcpop.m a2, v0
; RV32-NEXT: vid.v v10, v0.t
; RV32-NEXT: beqz a2, .LBB9_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: li a1, 32
; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vnsrl.wi v10, v8, 0
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vnsrl.wx v11, v8, a1
+; RV32-NEXT: vslidedown.vx v8, v10, a0
+; RV32-NEXT: vslidedown.vx v9, v11, a0
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
-; RV32-NEXT: vsrl.vx v8, v8, a1
-; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: .LBB9_2:
; RV32-NEXT: ret
;
; RV64-LABEL: extract_last_i64_scalable:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu
+; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu
; RV64-NEXT: vmv.v.i v10, 0
; RV64-NEXT: vcpop.m a1, v0
; RV64-NEXT: vid.v v10, v0.t
@@ -297,6 +320,8 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
; RV64-NEXT: # %bb.1:
; RV64-NEXT: vredmaxu.vs v10, v10, v10
; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vslidedown.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
@@ -345,21 +370,39 @@ define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x
}
define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
-; CHECK-LABEL: extract_last_double_scalable:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vcpop.m a0, v0
-; CHECK-NEXT: vid.v v10, v0.t
-; CHECK-NEXT: beqz a0, .LBB11_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: vredmaxu.vs v10, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v10
-; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-NEXT: vslidedown.vx v8, v8, a0
-; CHECK-NEXT: vfmv.f.s fa0, v8
-; CHECK-NEXT: .LBB11_2:
-; CHECK-NEXT: ret
+; RV32-LABEL: extract_last_double_scalable:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV32-NEXT: vmv.v.i v10, 0
+; RV32-NEXT: vcpop.m a0, v0
+; RV32-NEXT: vid.v v10, v0.t
+; RV32-NEXT: beqz a0, .LBB11_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: vredmaxu.vs v10, v10, v10
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v8, a0
+; RV32-NEXT: vfmv.f.s fa0, v8
+; RV32-NEXT: .LBB11_2:
+; RV32-NEXT: ret
+;
+; RV64-LABEL: extract_last_double_scalable:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
+; RV64-NEXT: vmv.v.i v10, 0
+; RV64-NEXT: vcpop.m a0, v0
+; RV64-NEXT: vid.v v10, v0.t
+; RV64-NEXT: beqz a0, .LBB11_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: vredmaxu.vs v10, v10, v10
+; RV64-NEXT: vmv.x.s a0, v10
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV64-NEXT: vslidedown.vx v8, v8, a0
+; RV64-NEXT: vfmv.f.s fa0, v8
+; RV64-NEXT: .LBB11_2:
+; RV64-NEXT: ret
%res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
ret double %res
}
From 3d9358d89bbad957af1f32ede2468385e08a0620 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 8 Jan 2025 14:31:02 +0000
Subject: [PATCH 2/3] Split up the SelectionDAG representation into 3 parts.
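The three parts, sketched as DAG nodes (assuming the split follows the
patch-1 expansion; value types and variable names here are illustrative):

    // 1) Find the index of the last active mask lane.
    SDValue Idx = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, IdxVT, Mask);
    // 2) Extract that lane from the data vector.
    SDValue Extract =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx);
    // 3) If no lane was active, choose the passthru value instead.
    SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask);
    SDValue Result = DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru);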
---
llvm/include/llvm/CodeGen/ISDOpcodes.h | 6 +-
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 22 +-
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 6 +-
.../SelectionDAG/LegalizeTypesGeneric.cpp | 61 ----
.../SelectionDAG/LegalizeVectorOps.cpp | 82 +----
.../SelectionDAG/SelectionDAGBuilder.cpp | 18 +-
.../SelectionDAG/SelectionDAGDumper.cpp | 4 +-
.../CodeGen/SelectionDAG/TargetLowering.cpp | 46 +++
llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 10 -
.../AArch64/vector-extract-last-active.ll | 310 +++++++++---------
.../RISCV/rvv/vector-extract-last-active.ll | 293 ++++++-----------
13 files changed, 334 insertions(+), 531 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index d2ed8ec2e74663..fd8784a4c10034 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1480,9 +1480,9 @@ enum NodeType {
// Output: Output Chain
EXPERIMENTAL_VECTOR_HISTOGRAM,
- // experimental.vector.extract.last.active intrinsic
- // Operands: Data, Mask, PassThru
- VECTOR_EXTRACT_LAST_ACTIVE,
+ // Finds the index of the last active mask element
+ // Operands: Mask
+ VECTOR_FIND_LAST_ACTIVE,
// llvm.clear_cache intrinsic
// Operands: Input Chain, Start Addres, End Address
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3751aac4df8ead..6edc750ea722d7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5368,6 +5368,11 @@ class TargetLowering : public TargetLoweringBase {
/// \returns The expansion result or SDValue() if it fails.
SDValue expandVPCTTZElements(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand VECTOR_FIND_LAST_ACTIVE nodes
+ /// \param N Node to expand
+ /// \returns The expansion result or SDValue() if it fails.
+ SDValue expandVectorFindLastActive(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand ABS nodes. Expands vector/scalar ABS nodes,
/// vector nodes can only succeed if all operations are legal/custom.
/// (ABS x) -> (XOR (ADD x, (SRA x, type_size)), (SRA x, type_size))
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 6d75f0788203f8..c519603fae9a28 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -155,8 +155,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = PromoteIntRes_EXTEND_VECTOR_INREG(N); break;
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
- Res = PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(N);
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(N);
break;
case ISD::SIGN_EXTEND:
@@ -2073,8 +2073,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
Res = PromoteIntOp_VECTOR_HISTOGRAM(N, OpNo);
break;
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
- Res = PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(N, OpNo);
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo);
break;
}
@@ -2817,10 +2817,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N,
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
-SDValue
-DAGTypeLegalizer::PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
- unsigned OpNo) {
- SmallVector<SDValue, 3> NewOps(N->ops());
+SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 1> NewOps(N->ops());
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
@@ -2863,9 +2862,6 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
- ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(N, Lo, Hi);
- break;
case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
@@ -6142,10 +6138,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
EVT VT = N->getValueType(0);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- return DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, SDLoc(N), NVT, N->ops());
+ return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, SDLoc(N), NVT, N->ops());
}
SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 0fc51c33d5f181..069e191d10d7d4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -378,7 +378,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
- SDValue PromoteIntRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -429,7 +429,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo);
- SDValue PromoteIntOp_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo);
void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -1217,8 +1217,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
- void ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N, SDValue &Lo,
- SDValue &Hi);
void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index f7d4800487d609..113a3bc0bbea69 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -19,7 +19,6 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
-#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;
@@ -245,66 +244,6 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
std::swap(Lo, Hi);
}
-void DAGTypeLegalizer::ExpandRes_VECTOR_EXTRACT_LAST_ACTIVE(SDNode *N,
- SDValue &Lo,
- SDValue &Hi) {
- SDValue Data = N->getOperand(0);
- SDValue Mask = N->getOperand(1);
- SDValue PassThru = N->getOperand(2);
-
- ElementCount OldEltCount = Data.getValueType().getVectorElementCount();
- EVT OldEltVT = Data.getValueType().getVectorElementType();
- SDLoc dl(N);
-
- EVT OldVT = N->getValueType(0);
- EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
-
- if (OldVT != OldEltVT) {
- // The result of EXTRACT_LAST_ACTIVE may be larger than the element type of
- // the input vector. If so, extend the elements of the input vector to the
- // same bitwidth as the result before expanding.
- assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
- EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldEltCount);
- Data = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
- }
-
- SDValue NewVec = DAG.getNode(
- ISD::BITCAST, dl,
- EVT::getVectorVT(*DAG.getContext(), NewVT, OldEltCount * 2), Data);
-
- auto [DataLo, DataHi] = DAG.SplitVector(NewVec, dl);
- auto [PassLo, PassHi] = DAG.SplitScalar(PassThru, dl, NewVT, NewVT);
-
- EVT SplitVT = DataLo.getValueType();
-
- // TODO: I *think* this works correctly, but I haven't confirmed it yet by
- // actually running a compiled program with example data.
- //
- // We want the matching lo and hi parts from whichever lane was the last
- // active.
- SDValue Deinterleaved;
- if (SplitVT.isFixedLengthVector()) {
- unsigned SplitNum = SplitVT.getVectorMinNumElements();
- SDValue Even = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
- createStrideMask(0, 2, SplitNum));
- SDValue Odd = DAG.getVectorShuffle(SplitVT, dl, DataLo, DataHi,
- createStrideMask(1, 2, SplitNum));
- Deinterleaved = DAG.getMergeValues({Even, Odd}, dl);
- } else
- Deinterleaved =
- DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl,
- DAG.getVTList(SplitVT, SplitVT), DataLo, DataHi);
-
- Lo = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
- Deinterleaved.getValue(0), Mask, PassLo);
- Hi = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, dl, NewVT,
- Deinterleaved.getValue(1), Mask, PassHi);
-
- // FIXME: Endianness?
- assert(!DAG.getDataLayout().isBigEndian() &&
- "Implement big endian result expansion for extract_last_active");
-}
-
void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
SDValue &Hi) {
assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8026e5c845b418..607c70675c988f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -29,7 +29,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -139,7 +138,6 @@ class VectorLegalizer {
SDValue ExpandVP_FNEG(SDNode *Node);
SDValue ExpandVP_FABS(SDNode *Node);
SDValue ExpandVP_FCOPYSIGN(SDNode *Node);
- SDValue ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -469,7 +467,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECTOR_COMPRESS:
case ISD::SCMP:
case ISD::UCMP:
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -506,6 +503,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECREDUCE_FMIN:
case ISD::VECREDUCE_FMAXIMUM:
case ISD::VECREDUCE_FMINIMUM:
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
Action = TLI.getOperationAction(Node->getOpcode(),
Node->getOperand(0).getValueType());
break;
@@ -1211,8 +1209,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECTOR_COMPRESS:
Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
return;
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
- Results.push_back(ExpandVECTOR_EXTRACT_LAST_ACTIVE(Node));
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Results.push_back(TLI.expandVectorFindLastActive(Node, DAG));
return;
case ISD::SCMP:
case ISD::UCMP:
@@ -1725,80 +1723,6 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) {
return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign);
}
-SDValue VectorLegalizer::ExpandVECTOR_EXTRACT_LAST_ACTIVE(SDNode *Node) {
- SDLoc DL(Node);
- SDValue Data = Node->getOperand(0);
- SDValue Mask = Node->getOperand(1);
- SDValue PassThru = Node->getOperand(2);
-
- EVT DataVT = Data.getValueType();
- EVT ScalarVT = PassThru.getValueType();
- EVT BoolVT = Mask.getValueType().getScalarType();
-
- // Find a suitable type for a stepvector.
- ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
- if (DataVT.isScalableVector())
- VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned EltWidth = TLI.getBitWidthForCttzElements(
- ScalarVT.getTypeForEVT(*DAG.getContext()), DataVT.getVectorElementCount(),
- /*ZeroIsPoison=*/true, &VScaleRange);
-
- // HACK: If the target selects a VT that's too wide based on the legal types
- // for a vecreduce_umax, it will force expansion of the node -- which
- // doesn't work on scalable vectors...
- // Is there another method we could use to get a smaller VT instead
- // of just capping to 32b?
- EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
- EVT StepVecVT = DataVT.changeVectorElementType(StepVT);
-
- // HACK: If the target selects a VT that's too small to form a legal vector
- // type, we also run into problems trying to expand the vecreduce_umax.
- //
- // I think perhaps we need to revisit how getBitWidthForCttzElements
- // works...
- if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
- TargetLowering::TypePromoteInteger) {
- StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
- StepVT = StepVecVT.getVectorElementType();
- }
-
- // Zero out lanes with inactive elements, then find the highest remaining
- // value from the stepvector.
- SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
- SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
- SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
-
- // HACK: Unfortunately, LegalizeVectorOps does not recursively legalize *all*
- // added nodes, just the end result nodes until it finds legal ops.
- // LegalizeDAG doesn't handle VSELECT at all presently. So if we need to
- // legalize a vselect then we have to do it here.
- //
- // We might want to change LegalizeVectorOps to walk backwards through the
- // nodes like LegalizeDAG? And share VSELECT legalization code with
- // LegalizeDAG?
- //
- // Or would that cause problems with illegal types that we might have just
- // introduced?
- //
- // Having a legal op with illegal types marked as Legal should work, with the
- // expectation being that type legalization fixes it up later.
- if (TLI.getOperationAction(ISD::VSELECT, StepVecVT) == TargetLowering::Expand)
- ActiveElts = LegalizeOp(ActiveElts);
-
- SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
-
- // Extract the corresponding lane from the data vector
- EVT ExtVT = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue Idx = DAG.getZExtOrTrunc(HighestIdx, DL, ExtVT);
- SDValue Extract =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Data, Idx);
-
- // If all mask lanes were inactive, choose the passthru value instead.
- SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, DL, BoolVT, Mask);
- return DAG.getSelect(DL, ScalarVT, AnyActive, Extract, PassThru);
-}
-
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index d1c644b0647189..abcc75c5a26abe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6426,17 +6426,25 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
unsigned Intrinsic) {
assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
"Tried lowering invalid vector extract last");
-
SDLoc sdl = getCurSDLoc();
+ const DataLayout &Layout = DAG.getDataLayout();
SDValue Data = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
- SDValue PassThru = getValue(I.getOperand(2));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ EVT ResVT = TLI.getValueType(Layout, I.getType());
- SDValue Result = DAG.getNode(ISD::VECTOR_EXTRACT_LAST_ACTIVE, sdl, ResultVT,
- Data, Mask, PassThru);
+ EVT ExtVT = TLI.getVectorIdxTy(Layout);
+ SDValue Idx = DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, sdl, ExtVT, Mask);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, sdl, ResVT, Data, Idx);
+
+ Value *Default = I.getOperand(2);
+ if (!isa<PoisonValue>(Default) && !isa<UndefValue>(Default)) {
+ SDValue PassThru = getValue(Default);
+ EVT BoolVT = Mask.getValueType().getScalarType();
+ SDValue AnyActive = DAG.getNode(ISD::VECREDUCE_OR, sdl, BoolVT, Mask);
+ Result = DAG.getSelect(sdl, ResVT, AnyActive, Result, PassThru);
+ }
setValue(&I, Result);
}
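With this change the builder emits VECTOR_FIND_LAST_ACTIVE on the mask,
extracts that lane from the data, and only adds the VECREDUCE_OR-guarded
select when the passthru is a real value. In scalar form the new lowering
behaves roughly as follows (a sketch, not LLVM code; std::optional stands
in for a poison/undef passthru):

#include <cstdint>
#include <optional>
#include <vector>

// VECTOR_FIND_LAST_ACTIVE: highest set lane index, 0 if no lane is set.
static unsigned findLastActive(const std::vector<bool> &Mask) {
  unsigned Idx = 0;
  for (unsigned I = 0; I < Mask.size(); ++I)
    if (Mask[I])
      Idx = I;
  return Idx;
}

static int32_t extractLastActive(const std::vector<int32_t> &Data,
                                 const std::vector<bool> &Mask,
                                 std::optional<int32_t> PassThru) {
  // EXTRACT_VECTOR_ELT of the found lane.
  int32_t Result = Data[findLastActive(Mask)];
  if (PassThru) { // Poison/undef passthru: skip the select entirely.
    bool AnyActive = false; // VECREDUCE_OR of the mask.
    for (bool B : Mask)
      AnyActive |= B;
    if (!AnyActive)
      Result = *PassThru;
  }
  return Result;
}

int main() {
  std::vector<int32_t> Data = {10, 20, 30, 40};
  std::vector<bool> None(4, false);
  // With a real passthru an all-false mask yields the passthru; with a
  // poison passthru whatever sits in lane 0 is acceptable for the intrinsic.
  return (extractLastActive(Data, None, -1) == -1 &&
          extractLastActive(Data, None, std::nullopt) == 10)
             ? 0
             : 1;
}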
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 42cbb721703d99..f63c8dd3df1c83 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,8 +567,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return "histogram";
- case ISD::VECTOR_EXTRACT_LAST_ACTIVE:
- return "extract_last_active";
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ return "find_last_active";
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9f57884eae04df..809948c8178c75 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/CodeGenCommonISel.h"
@@ -9453,6 +9454,51 @@ SDValue TargetLowering::expandVPCTTZElements(SDNode *N,
return DAG.getNode(ISD::VP_REDUCE_UMIN, DL, ResVT, ExtEVL, Select, Mask, EVL);
}
+SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
+ SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(0);
+ EVT MaskVT = Mask.getValueType();
+ EVT BoolVT = MaskVT.getScalarType();
+
+ // Find a suitable type for a stepvector.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ if (MaskVT.isScalableVector())
+ VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned EltWidth = TLI.getBitWidthForCttzElements(
+ BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(),
+ /*ZeroIsPoison=*/true, &VScaleRange);
+
+ // FIXME: If the target selects a VT that's too wide based on the legal types
+ // for a vecreduce_umax, it will force expansion of the node -- which
+ // doesn't work on scalable vectors...
+ // Is there another method we could use to get a smaller VT instead
+ // of just capping to 32b?
+ EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+ EVT StepVecVT = MaskVT.changeVectorElementType(StepVT);
+
+ // FIXME: If the target selects a VT that's too small to form a legal vector
+ // type, we also run into problems if expanding after type
+ // legalization.
+ //
+ // I think perhaps we need to revisit how getBitWidthForCttzElements
+ // works...
+ if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
+ TargetLowering::TypePromoteInteger) {
+ StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ StepVT = StepVecVT.getVectorElementType();
+ }
+
+ // Zero out lanes with inactive elements, then find the highest remaining
+ // value from the stepvector.
+ SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
+ SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
+ SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
+ SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
+ return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0));
+}
+
SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG,
bool IsNegative) const {
SDLoc dl(N);
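The expansion above reduces to: materialize a step vector, zero the lanes
where the mask is inactive, and take an unsigned-max reduction of what
remains. A scalar model (a sketch, not LLVM code):

#include <algorithm>
#include <cstdint>
#include <vector>

static uint64_t expandFindLastActive(const std::vector<bool> &Mask) {
  uint64_t HighestIdx = 0;
  for (uint64_t I = 0; I < Mask.size(); ++I) {
    uint64_t Step = Mask[I] ? I : 0;         // select(mask, stepvector, 0)
    HighestIdx = std::max(HighestIdx, Step); // vecreduce_umax
  }
  return HighestIdx; // zext/trunc to the node's result type
}

int main() {
  // Lanes 1 and 2 active -> the last active index is 2.
  return expandFindLastActive({false, true, true, false}) == 2 ? 0 : 1;
}

Note that an all-false mask also reduces to 0, which is why the any-active
select stays with the caller (now SelectionDAGBuilder) rather than in this
expansion.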
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index cc822ad5ec50e8..73af0a9a714074 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -819,7 +819,7 @@ void TargetLoweringBase::initActions() {
#include "llvm/IR/VPIntrinsics.def"
// Masked vector extracts default to expand.
- setOperationAction(ISD::VECTOR_EXTRACT_LAST_ACTIVE, VT, Expand);
+ setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand);
// FP environment operations default to expand.
setOperationAction(ISD::GET_FPENV, VT, Expand);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 14fd7851bfa104..3ad2905ce52076 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -401,16 +401,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
- // TODO: Should we include any other operations here? The calls to
- // addDRType/addQRType below do mark VSELECT as Expand for the
- // specified VTs, but leave other illegal types as the default
- // of 'Legal'. LegalizeDAG doesn't legalize VSELECT after type
- // legalization if LegalizeVectorOps introduces one.
- for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
- setOperationAction(ISD::VSELECT, VT, Expand);
- for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
- setOperationAction(ISD::VSELECT, VT, Expand);
-
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index a0e9c6607042f6..3b11e67d072e7a 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -7,26 +7,21 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; NEON-FIXED: // %bb.0:
; NEON-FIXED-NEXT: sub sp, sp, #16
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
-; NEON-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
-; NEON-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
+; NEON-FIXED-NEXT: cmeq v2.16b, v1.16b, #0
; NEON-FIXED-NEXT: adrp x8, .LCPI0_0
-; NEON-FIXED-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
-; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
+; NEON-FIXED-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
+; NEON-FIXED-NEXT: mov x9, sp
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
-; NEON-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; NEON-FIXED-NEXT: bic v2.16b, v3.16b, v2.16b
; NEON-FIXED-NEXT: umaxv b1, v1.16b
-; NEON-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
-; NEON-FIXED-NEXT: fmov w10, s1
-; NEON-FIXED-NEXT: fmov x8, d2
-; NEON-FIXED-NEXT: bfxil x11, x10, #0, #4
-; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
-; NEON-FIXED-NEXT: lsr x9, x8, #16
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: ldrb w9, [x11]
-; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
-; NEON-FIXED-NEXT: tst w8, #0xff
-; NEON-FIXED-NEXT: csel w0, w9, w0, ne
+; NEON-FIXED-NEXT: umaxv b2, v2.16b
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfxil x9, x8, #0, #4
+; NEON-FIXED-NEXT: ldrb w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -34,25 +29,20 @@ define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; SVE-FIXED: // %bb.0:
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
-; SVE-FIXED-NEXT: cmtst v2.16b, v1.16b, v1.16b
-; SVE-FIXED-NEXT: index z4.b, #0, #1
-; SVE-FIXED-NEXT: cmeq v1.16b, v1.16b, #0
-; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: index z2.b, #0, #1
+; SVE-FIXED-NEXT: cmeq v3.16b, v1.16b, #0
+; SVE-FIXED-NEXT: cmtst v1.16b, v1.16b, v1.16b
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: ext v3.16b, v2.16b, v2.16b, #8
-; SVE-FIXED-NEXT: bic v1.16b, v4.16b, v1.16b
+; SVE-FIXED-NEXT: bic v2.16b, v2.16b, v3.16b
; SVE-FIXED-NEXT: umaxv b1, v1.16b
-; SVE-FIXED-NEXT: orr v2.8b, v2.8b, v3.8b
-; SVE-FIXED-NEXT: fmov x8, d2
-; SVE-FIXED-NEXT: fmov w10, s1
-; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
-; SVE-FIXED-NEXT: bfxil x11, x10, #0, #4
-; SVE-FIXED-NEXT: lsr x9, x8, #16
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: ldrb w9, [x11]
-; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
-; SVE-FIXED-NEXT: tst w8, #0xff
-; SVE-FIXED-NEXT: csel w0, w9, w0, ne
+; SVE-FIXED-NEXT: umaxv b2, v2.16b
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfxil x9, x8, #0, #4
+; SVE-FIXED-NEXT: ldrb w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <16 x i8> %mask, zeroinitializer
@@ -67,22 +57,19 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
; NEON-FIXED-NEXT: adrp x8, .LCPI1_0
-; NEON-FIXED-NEXT: mov x11, sp
-; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI1_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v1.8b, v1.8h
-; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: fmov x8, d1
-; NEON-FIXED-NEXT: umaxv b1, v2.8b
-; NEON-FIXED-NEXT: orr x8, x8, x8, lsr #32
-; NEON-FIXED-NEXT: lsr x9, x8, #16
-; NEON-FIXED-NEXT: fmov w10, s1
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #8
-; NEON-FIXED-NEXT: bfi x11, x10, #1, #3
-; NEON-FIXED-NEXT: tst w8, #0xff
-; NEON-FIXED-NEXT: ldrh w9, [x11]
-; NEON-FIXED-NEXT: csel w0, w9, w0, ne
+; NEON-FIXED-NEXT: xtn v2.8b, v1.8h
+; NEON-FIXED-NEXT: umaxv h1, v1.8h
+; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; NEON-FIXED-NEXT: umaxv b2, v2.8b
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
+; NEON-FIXED-NEXT: ldrh w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -91,22 +78,19 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.8h, v1.8h, v1.8h
-; SVE-FIXED-NEXT: index z2.b, #0, #1
-; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: index z3.b, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v1.8b, v1.8h
-; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: fmov x8, d1
-; SVE-FIXED-NEXT: umaxv b1, v2.8b
-; SVE-FIXED-NEXT: orr x8, x8, x8, lsr #32
-; SVE-FIXED-NEXT: lsr x9, x8, #16
-; SVE-FIXED-NEXT: fmov w10, s1
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #8
-; SVE-FIXED-NEXT: bfi x11, x10, #1, #3
-; SVE-FIXED-NEXT: tst w8, #0xff
-; SVE-FIXED-NEXT: ldrh w9, [x11]
-; SVE-FIXED-NEXT: csel w0, w9, w0, ne
+; SVE-FIXED-NEXT: xtn v2.8b, v1.8h
+; SVE-FIXED-NEXT: umaxv h1, v1.8h
+; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
+; SVE-FIXED-NEXT: umaxv b2, v2.8b
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
+; SVE-FIXED-NEXT: ldrh w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <8 x i16> %mask, zeroinitializer
@@ -121,21 +105,19 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI2_0
-; NEON-FIXED-NEXT: mov x11, sp
-; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI2_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
-; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: xtn v2.4h, v1.4s
+; NEON-FIXED-NEXT: umaxv s1, v1.4s
+; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
; NEON-FIXED-NEXT: umaxv h2, v2.4h
-; NEON-FIXED-NEXT: lsr x9, x8, #32
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
-; NEON-FIXED-NEXT: fmov w10, s2
-; NEON-FIXED-NEXT: tst w8, #0xffff
-; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
-; NEON-FIXED-NEXT: ldr w9, [x11]
-; NEON-FIXED-NEXT: csel w0, w9, w0, ne
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: ldr w8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel w0, w8, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -144,21 +126,19 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT: index z2.h, #0, #1
-; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: index z3.h, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
-; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: xtn v2.4h, v1.4s
+; SVE-FIXED-NEXT: umaxv s1, v1.4s
+; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
; SVE-FIXED-NEXT: umaxv h2, v2.4h
-; SVE-FIXED-NEXT: lsr x9, x8, #32
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
-; SVE-FIXED-NEXT: fmov w10, s2
-; SVE-FIXED-NEXT: tst w8, #0xffff
-; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
-; SVE-FIXED-NEXT: ldr w9, [x11]
-; SVE-FIXED-NEXT: csel w0, w9, w0, ne
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: ldr w8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel w0, w8, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <4 x i32> %mask, zeroinitializer
@@ -173,20 +153,19 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI3_0
-; NEON-FIXED-NEXT: mov x10, sp
-; NEON-FIXED-NEXT: ldr d2, [x8, :lo12:.LCPI3_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI3_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
-; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: xtn v2.2s, v1.2d
+; NEON-FIXED-NEXT: umaxv s1, v1.4s
+; NEON-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; NEON-FIXED-NEXT: fmov w9, s2
-; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
-; NEON-FIXED-NEXT: lsr x9, x8, #32
-; NEON-FIXED-NEXT: ldr x10, [x10]
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: cmp w8, #0
-; NEON-FIXED-NEXT: csel x0, x10, x0, ne
+; NEON-FIXED-NEXT: fmov w8, s2
+; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT: ldr x8, [x9]
+; NEON-FIXED-NEXT: fmov w9, s1
+; NEON-FIXED-NEXT: tst w9, #0x1
+; NEON-FIXED-NEXT: csel x0, x8, x0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
;
@@ -195,20 +174,19 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT: index z2.s, #0, #1
-; SVE-FIXED-NEXT: mov x10, sp
+; SVE-FIXED-NEXT: index z3.s, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
-; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: xtn v2.2s, v1.2d
+; SVE-FIXED-NEXT: umaxv s1, v1.4s
+; SVE-FIXED-NEXT: and v2.8b, v2.8b, v3.8b
; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
-; SVE-FIXED-NEXT: fmov w9, s2
-; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
-; SVE-FIXED-NEXT: lsr x9, x8, #32
-; SVE-FIXED-NEXT: ldr x10, [x10]
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: cmp w8, #0
-; SVE-FIXED-NEXT: csel x0, x10, x0, ne
+; SVE-FIXED-NEXT: fmov w8, s2
+; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT: ldr x8, [x9]
+; SVE-FIXED-NEXT: fmov w9, s1
+; SVE-FIXED-NEXT: tst w9, #0x1
+; SVE-FIXED-NEXT: csel x0, x8, x0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
%notzero = icmp ne <2 x i64> %mask, zeroinitializer
@@ -223,20 +201,18 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
; NEON-FIXED-NEXT: adrp x8, .LCPI4_0
-; NEON-FIXED-NEXT: mov x11, sp
-; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI4_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI4_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
-; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: xtn v3.4h, v1.4s
+; NEON-FIXED-NEXT: umaxv s1, v1.4s
+; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
; NEON-FIXED-NEXT: umaxv h3, v3.4h
-; NEON-FIXED-NEXT: lsr x9, x8, #32
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: orr w8, w8, w8, lsr #16
-; NEON-FIXED-NEXT: fmov w10, s3
-; NEON-FIXED-NEXT: tst w8, #0xffff
-; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
-; NEON-FIXED-NEXT: ldr s0, [x11]
+; NEON-FIXED-NEXT: fmov w8, s3
+; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
+; NEON-FIXED-NEXT: fmov w8, s1
+; NEON-FIXED-NEXT: ldr s0, [x9]
+; NEON-FIXED-NEXT: tst w8, #0x1
; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -246,20 +222,18 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT: index z3.h, #0, #1
-; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: index z4.h, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
-; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: xtn v3.4h, v1.4s
+; SVE-FIXED-NEXT: umaxv s1, v1.4s
+; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
; SVE-FIXED-NEXT: umaxv h3, v3.4h
-; SVE-FIXED-NEXT: lsr x9, x8, #32
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: orr w8, w8, w8, lsr #16
-; SVE-FIXED-NEXT: fmov w10, s3
-; SVE-FIXED-NEXT: tst w8, #0xffff
-; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
-; SVE-FIXED-NEXT: ldr s0, [x11]
+; SVE-FIXED-NEXT: fmov w8, s3
+; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
+; SVE-FIXED-NEXT: fmov w8, s1
+; SVE-FIXED-NEXT: ldr s0, [x9]
+; SVE-FIXED-NEXT: tst w8, #0x1
; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -275,19 +249,18 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
; NEON-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
; NEON-FIXED-NEXT: adrp x8, .LCPI5_0
-; NEON-FIXED-NEXT: mov x10, sp
-; NEON-FIXED-NEXT: ldr d3, [x8, :lo12:.LCPI5_0]
+; NEON-FIXED-NEXT: mov x9, sp
+; NEON-FIXED-NEXT: ldr d4, [x8, :lo12:.LCPI5_0]
; NEON-FIXED-NEXT: str q0, [sp]
-; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
-; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: xtn v3.2s, v1.2d
+; NEON-FIXED-NEXT: umaxv s1, v1.4s
+; NEON-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; NEON-FIXED-NEXT: fmov w9, s3
-; NEON-FIXED-NEXT: bfi x10, x9, #3, #1
-; NEON-FIXED-NEXT: lsr x9, x8, #32
-; NEON-FIXED-NEXT: ldr d0, [x10]
-; NEON-FIXED-NEXT: orr w8, w8, w9
-; NEON-FIXED-NEXT: cmp w8, #0
+; NEON-FIXED-NEXT: fmov w8, s3
+; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
+; NEON-FIXED-NEXT: fmov w8, s1
+; NEON-FIXED-NEXT: ldr d0, [x9]
+; NEON-FIXED-NEXT: tst w8, #0x1
; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -297,19 +270,18 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; SVE-FIXED-NEXT: sub sp, sp, #16
; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
; SVE-FIXED-NEXT: cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT: index z3.s, #0, #1
-; SVE-FIXED-NEXT: mov x10, sp
+; SVE-FIXED-NEXT: index z4.s, #0, #1
+; SVE-FIXED-NEXT: mov x9, sp
; SVE-FIXED-NEXT: str q0, [sp]
-; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
-; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: xtn v3.2s, v1.2d
+; SVE-FIXED-NEXT: umaxv s1, v1.4s
+; SVE-FIXED-NEXT: and v3.8b, v3.8b, v4.8b
; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
-; SVE-FIXED-NEXT: fmov w9, s3
-; SVE-FIXED-NEXT: bfi x10, x9, #3, #1
-; SVE-FIXED-NEXT: lsr x9, x8, #32
-; SVE-FIXED-NEXT: ldr d0, [x10]
-; SVE-FIXED-NEXT: orr w8, w8, w9
-; SVE-FIXED-NEXT: cmp w8, #0
+; SVE-FIXED-NEXT: fmov w8, s3
+; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
+; SVE-FIXED-NEXT: fmov w8, s1
+; SVE-FIXED-NEXT: ldr d0, [x9]
+; SVE-FIXED-NEXT: tst w8, #0x1
; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -430,6 +402,24 @@ define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale
ret double %res
}
+;; If the passthru parameter is poison, we shouldn't see a select at the end.
+define i8 @extract_last_i8_scalable_poison_passthru(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask) #0 {
+; CHECK-LABEL: extract_last_i8_scalable_poison_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z1.b, #0, #1
+; CHECK-NEXT: mov z2.b, #0 // =0x0
+; CHECK-NEXT: sel z1.b, p0, z1.b, z2.b
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: umaxv b1, p0, z1.b
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: and x8, x8, #0xff
+; CHECK-NEXT: whilels p0.b, xzr, x8
+; CHECK-NEXT: lastb w0, p0, z0.b
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 poison)
+ ret i8 %res
+}
+
declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
index 81ff400b38cb47..10929394af75ff 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-extract-last-active.ll
@@ -76,31 +76,23 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT: vmsne.vi v9, v9, 0
-; RV32-NEXT: vmv.v.i v0, 1
-; RV32-NEXT: vslidedown.vi v10, v8, 1
-; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
-; RV32-NEXT: vmv.v.i v12, 0
-; RV32-NEXT: vmv1r.v v11, v10
-; RV32-NEXT: vcpop.m a2, v9
-; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
-; RV32-NEXT: vrgather.vi v11, v8, 1, v0.t
-; RV32-NEXT: vmv1r.v v0, v9
+; RV32-NEXT: vmsne.vi v0, v9, 0
; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; RV32-NEXT: vid.v v12, v0.t
+; RV32-NEXT: vmv.v.i v9, 0
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vid.v v9, v0.t
; RV32-NEXT: beqz a2, .LBB3_2
; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v9, v12, v12
-; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT: vslideup.vi v8, v10, 1
-; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-NEXT: vredmaxu.vs v9, v9, v9
+; RV32-NEXT: li a1, 32
; RV32-NEXT: vmv.x.s a0, v9
; RV32-NEXT: andi a0, a0, 255
-; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; RV32-NEXT: vslidedown.vx v9, v11, a0
+; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma
; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vmv.x.s a1, v9
; RV32-NEXT: vmv.x.s a0, v8
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: .LBB3_2:
; RV32-NEXT: ret
;
@@ -176,115 +168,64 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
}
define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
-; RV32-LABEL: extract_last_i8_scalable:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu
-; RV32-NEXT: vmv.v.i v16, 0
-; RV32-NEXT: vcpop.m a1, v0
-; RV32-NEXT: vid.v v16, v0.t
-; RV32-NEXT: beqz a1, .LBB6_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v16, v16
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: .LBB6_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extract_last_i8_scalable:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu
-; RV64-NEXT: vmv.v.i v16, 0
-; RV64-NEXT: vcpop.m a1, v0
-; RV64-NEXT: vid.v v16, v0.t
-; RV64-NEXT: beqz a1, .LBB6_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v16, v16
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: vsetvli zero, zero, e8, m2, ta, ma
-; RV64-NEXT: vslidedown.vx v8, v8, a0
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: .LBB6_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: extract_last_i8_scalable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vid.v v10, v0.t
+; CHECK-NEXT: beqz a1, .LBB6_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vredmaxu.vs v10, v10, v10
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: ret
%res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
ret i8 %res
}
define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
-; RV32-LABEL: extract_last_i16_scalable:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV32-NEXT: vmv.v.i v12, 0
-; RV32-NEXT: vcpop.m a1, v0
-; RV32-NEXT: vid.v v12, v0.t
-; RV32-NEXT: beqz a1, .LBB7_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v12, v12
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: .LBB7_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extract_last_i16_scalable:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu
-; RV64-NEXT: vmv.v.i v12, 0
-; RV64-NEXT: vcpop.m a1, v0
-; RV64-NEXT: vid.v v12, v0.t
-; RV64-NEXT: beqz a1, .LBB7_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v12, v12
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT: vslidedown.vx v8, v8, a0
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: .LBB7_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: extract_last_i16_scalable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vid.v v10, v0.t
+; CHECK-NEXT: beqz a1, .LBB7_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vredmaxu.vs v10, v10, v10
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: ret
%res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
ret i16 %res
}
define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) {
-; RV32-LABEL: extract_last_i32_scalable:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vcpop.m a1, v0
-; RV32-NEXT: vid.v v10, v0.t
-; RV32-NEXT: beqz a1, .LBB8_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: .LBB8_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extract_last_i32_scalable:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vcpop.m a1, v0
-; RV64-NEXT: vid.v v10, v0.t
-; RV64-NEXT: beqz a1, .LBB8_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v10, v10
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vx v8, v8, a0
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: .LBB8_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: extract_last_i32_scalable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vcpop.m a1, v0
+; CHECK-NEXT: vid.v v10, v0.t
+; CHECK-NEXT: beqz a1, .LBB8_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vredmaxu.vs v10, v10, v10
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: .LBB8_2:
+; CHECK-NEXT: ret
%res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
ret i32 %res
}
@@ -292,27 +233,28 @@ define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1
define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64_scalable:
; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu
+; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, mu
; RV32-NEXT: vmv.v.i v10, 0
; RV32-NEXT: vcpop.m a2, v0
; RV32-NEXT: vid.v v10, v0.t
; RV32-NEXT: beqz a2, .LBB9_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vnsrl.wi v10, v8, 0
; RV32-NEXT: li a1, 32
-; RV32-NEXT: vnsrl.wx v11, v8, a1
-; RV32-NEXT: vslidedown.vx v8, v10, a0
-; RV32-NEXT: vslidedown.vx v9, v11, a0
+; RV32-NEXT: vmv.x.s a0, v10
+; RV32-NEXT: andi a0, a0, 255
+; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; RV32-NEXT: vslidedown.vx v8, v8, a0
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vmv.x.s a1, v9
+; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma
+; RV32-NEXT: vsrl.vx v8, v8, a1
+; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: .LBB9_2:
; RV32-NEXT: ret
;
; RV64-LABEL: extract_last_i64_scalable:
; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu
+; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, mu
; RV64-NEXT: vmv.v.i v10, 0
; RV64-NEXT: vcpop.m a1, v0
; RV64-NEXT: vid.v v10, v0.t
@@ -320,8 +262,7 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
; RV64-NEXT: # %bb.1:
; RV64-NEXT: vredmaxu.vs v10, v10, v10
; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
+; RV64-NEXT: andi a0, a0, 255
; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT: vslidedown.vx v8, v8, a0
; RV64-NEXT: vmv.x.s a0, v8
@@ -332,77 +273,43 @@ define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1
}
define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) {
-; RV32-LABEL: extract_last_float_scalable:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vcpop.m a0, v0
-; RV32-NEXT: vid.v v10, v0.t
-; RV32-NEXT: beqz a0, .LBB10_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vfmv.f.s fa0, v8
-; RV32-NEXT: .LBB10_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extract_last_float_scalable:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vcpop.m a0, v0
-; RV64-NEXT: vid.v v10, v0.t
-; RV64-NEXT: beqz a0, .LBB10_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v10, v10
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; RV64-NEXT: vslidedown.vx v8, v8, a0
-; RV64-NEXT: vfmv.f.s fa0, v8
-; RV64-NEXT: .LBB10_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: extract_last_float_scalable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vcpop.m a0, v0
+; CHECK-NEXT: vid.v v10, v0.t
+; CHECK-NEXT: beqz a0, .LBB10_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vredmaxu.vs v10, v10, v10
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: ret
%res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
ret float %res
}
define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
-; RV32-LABEL: extract_last_double_scalable:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: vcpop.m a0, v0
-; RV32-NEXT: vid.v v10, v0.t
-; RV32-NEXT: beqz a0, .LBB11_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: vredmaxu.vs v10, v10, v10
-; RV32-NEXT: vmv.x.s a0, v10
-; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV32-NEXT: vslidedown.vx v8, v8, a0
-; RV32-NEXT: vfmv.f.s fa0, v8
-; RV32-NEXT: .LBB11_2:
-; RV32-NEXT: ret
-;
-; RV64-LABEL: extract_last_double_scalable:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu
-; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: vcpop.m a0, v0
-; RV64-NEXT: vid.v v10, v0.t
-; RV64-NEXT: beqz a0, .LBB11_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: vredmaxu.vs v10, v10, v10
-; RV64-NEXT: vmv.x.s a0, v10
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: srli a0, a0, 32
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vslidedown.vx v8, v8, a0
-; RV64-NEXT: vfmv.f.s fa0, v8
-; RV64-NEXT: .LBB11_2:
-; RV64-NEXT: ret
+; CHECK-LABEL: extract_last_double_scalable:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vcpop.m a0, v0
+; CHECK-NEXT: vid.v v10, v0.t
+; CHECK-NEXT: beqz a0, .LBB11_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vredmaxu.vs v10, v10, v10
+; CHECK-NEXT: vmv.x.s a0, v10
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v8, a0
+; CHECK-NEXT: vfmv.f.s fa0, v8
+; CHECK-NEXT: .LBB11_2:
+; CHECK-NEXT: ret
%res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
ret double %res
}
>From 52e55c0983d96616ea42e8c70ad54bbcd9a07b2c Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Mon, 13 Jan 2025 15:55:56 +0000
Subject: [PATCH 3/3] Remove 32b hardcoded upper bound, clarify type promotion
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 20 ++++++-------------
1 file changed, 6 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 809948c8178c75..90ac79cfb0e3b9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9462,28 +9462,20 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
EVT BoolVT = MaskVT.getScalarType();
// Find a suitable type for a stepvector.
- ConstantRange VScaleRange(1, /*isFullSet=*/true); // Dummy value.
+ ConstantRange VScaleRange(1, /*isFullSet=*/true); // Fixed length default.
if (MaskVT.isScalableVector())
VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned EltWidth = TLI.getBitWidthForCttzElements(
BoolVT.getTypeForEVT(*DAG.getContext()), MaskVT.getVectorElementCount(),
/*ZeroIsPoison=*/true, &VScaleRange);
-
- // FIXME: If the target selects a VT that's too wide based on the legal types
- // for a vecreduce_umax, it will force expansion of the node -- which
- // doesn't work on scalable vectors...
- // Is there another method we could use to get a smaller VT instead
- // of just capping to 32b?
- EVT StepVT = MVT::getIntegerVT(std::min(EltWidth, 32u));
+ EVT StepVT = MVT::getIntegerVT(EltWidth);
EVT StepVecVT = MaskVT.changeVectorElementType(StepVT);
- // FIXME: If the target selects a VT that's too small to form a legal vector
- // type, we also run into problems if expanding after type
- // legalization.
- //
- // I think perhaps we need to revisit how getBitWidthForCttzElements
- // works...
+ // If promotion is required to make the type legal, do it here; promotion
+ // of integers within LegalizeVectorOps looks for types with the same
+ // total size but a smaller number of larger elements, not the usual
+ // promotion to a larger total size with the same number of elements.
if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
TargetLowering::TypePromoteInteger) {
StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
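With the hardcoded 32-bit cap gone, the step element width now comes
entirely from getBitWidthForCttzElements, which needs enough bits to name
the highest possible lane index (using the vscale range for scalable
types). Roughly, under the assumption that widths round up to a
power-of-two integer type (stepElementWidth is a hypothetical helper, not
the actual LLVM implementation):

#include <cstdint>

// Enough bits for MaxIndex = ElementCount * MaxVScale - 1, with MaxVScale
// being 1 for fixed-length vectors, rounded up to at least an i8.
static unsigned stepElementWidth(uint64_t ElementCount, uint64_t MaxVScale) {
  uint64_t MaxIndex = ElementCount * MaxVScale - 1;
  unsigned Bits = 1;
  while (MaxIndex >> Bits)
    ++Bits;
  unsigned Width = 8;
  while (Width < Bits)
    Width *= 2;
  return Width;
}

int main() {
  // <16 x i8>: indices 0..15 fit in i8; <vscale x 16 x i8> with vscale up
  // to 16: indices 0..255 still fit in i8; with vscale up to 256 the
  // indices 0..4095 need i16.
  return (stepElementWidth(16, 1) == 8 && stepElementWidth(16, 16) == 8 &&
          stepElementWidth(16, 256) == 16)
             ? 0
             : 1;
}

This is consistent with the RISC-V tests above settling on e8 step vectors
(with an andi to mask the index to 255) rather than the old e32 ones.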