[llvm] [LV] Enable scalable FindLast on RISCV. (PR #184931)

Thu Mar 5 17:56:25 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-risc-v

Author: Elvis Wang (ElvisWang123)

<details>
<summary>Changes</summary>

This patch enables vectorization of FindLast reductions with scalable
vectors on RISC-V.

Note that this patch only makes it possible to vectorize FindLast with
scalable vectors. It doesn't work well with EVL tail-folding, because the
header mask still has users inside the loop body, so the header mask cannot
be fully removed.

Based on #184067 (TTI). Split off from #184068 (optimize for EVL).

---

Patch is 28.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/184931.diff


4 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+39) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+1) 
- (added) llvm/test/Analysis/CostModel/RISCV/extract-last-active.ll (+108) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment-fold-tail.ll (+138) 


``````````diff

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ffb64d9f52beb..22ac00fbd926f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1702,6 +1702,45 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                CmpInst::FCMP_UNO, CostKind);
     return Cost;
   }
+  case Intrinsic::experimental_vector_extract_last_active: {
+    Type *ValTy = ICA.getArgTypes()[0];
+    Type *MaskTy = ICA.getArgTypes()[1];
+
+    auto ValLT = getTypeLegalizationCost(ValTy);
+    auto MaskLT = getTypeLegalizationCost(MaskTy);
+
+    // TODO: Return cheaper cost when the entire lane is inactive.
+    // The expected asm sequence is:
+    // vcpop.m a0, v0
+    // beqz a0, exit # Return passthru when the entire lane is inactive.
+    // vid v10, v0.t
+    // vredmaxu.vs v10, v10, v10
+    // vmv.x.s a0, v10
+    // zext.b a0, a0
+    // vslidedown.vx v8, v8, a0
+    // vmv.x.s a0, v8
+    // exit:
+    //   ...
+    auto *Int8Ty = Type::getInt8Ty(ValTy->getContext());
+    auto *Int8VecTy =
+        VectorType::get(Int8Ty, cast<VectorType>(ValTy)->getElementCount());
+    auto Int8LT = getTypeLegalizationCost(Int8VecTy);
+    InstructionCost Cost = 0;
+    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
+
+    Cost += MaskLT.first *
+            getRISCVInstructionCost(RISCV::VCPOP_M, MaskLT.second, CostKind);
+    Cost += getCFInstrCost(Instruction::Br, CostKind, nullptr);
+    Cost += Int8LT.first *
+            getRISCVInstructionCost(Opcodes, Int8LT.second, CostKind);
+    Cost += getCastInstrCost(Instruction::ZExt,
+                             Type::getInt64Ty(ValTy->getContext()), Int8Ty,
+                             TTI::CastContextHint::None, CostKind, nullptr);
+    Cost += ValLT.first *
+            getRISCVInstructionCost({RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
+                                    ValLT.second, CostKind);
+    return Cost;
+  }
   }
 
   if (ST->hasVInstructions() && RetTy->isVectorTy()) {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 424f9fe52c59e..483a774c218eb 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -394,6 +394,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
     case RecurKind::UMax:
     case RecurKind::FMin:
     case RecurKind::FMax:
+    case RecurKind::FindLast:
       return true;
     case RecurKind::AnyOf:
     case RecurKind::FAdd:
diff --git a/llvm/test/Analysis/CostModel/RISCV/extract-last-active.ll b/llvm/test/Analysis/CostModel/RISCV/extract-last-active.ll
new file mode 100644
index 0000000000000..d6b00a8bf8a79
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/extract-last-active.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
+
+define void @extractions() {
+; CHECK-LABEL: 'extractions'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v16i8 = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> poison, <16 x i1> poison, i8 poison)
+  %v8i16 = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> poison, <8 x i1> poison, i16 poison)
+  %v4i32 = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> poison, <4 x i1> poison, i32 poison)
+  %v2i64 = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> poison, <2 x i1> poison, i64 poison)
+  %v8f16 = call half @llvm.experimental.vector.extract.last.active.v8f16(<8 x half> poison, <8 x i1> poison, half poison)
+  %v8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v8bf16(<8 x bfloat> poison, <8 x i1> poison, bfloat poison)
+  %v4f32 = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> poison, <4 x i1> poison, float poison)
+  %v2f64 = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> poison, <2 x i1> poison, double poison)
+  %nxv16i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> poison, <vscale x 16 x i1> poison, i8 poison)
+  %nxv8i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> poison, <vscale x 8 x i1> poison, i16 poison)
+  %nxv4i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> poison, i32 poison)
+  %nxv2i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> poison, <vscale x 2 x i1> poison, i64 poison)
+  %nxv8f16 = call half @llvm.experimental.vector.extract.last.active.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> poison, half poison)
+  %nxv8bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv8bf16(<vscale x 8 x bfloat> poison, <vscale x 8 x i1> poison, bfloat poison)
+  %nxv4f32 = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> poison, float poison)
+  %nxv2f64 = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> poison, double poison)
+
+  %v32i8 = call i8 @llvm.experimental.vector.extract.last.active.v32i8(<32 x i8> poison, <32 x i1> poison, i8 poison)
+  %v16i16 = call i16 @llvm.experimental.vector.extract.last.active.v16i16(<16 x i16> poison, <16 x i1> poison, i16 poison)
+  %v8i32 = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> poison, <8 x i1> poison, i32 poison)
+  %v4i64 = call i64 @llvm.experimental.vector.extract.last.active.v4i64(<4 x i64> poison, <4 x i1> poison, i64 poison)
+  %v16f16 = call half @llvm.experimental.vector.extract.last.active.v16f16(<16 x half> poison, <16 x i1> poison, half poison)
+  %v16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v16bf16(<16 x bfloat> poison, <16 x i1> poison, bfloat poison)
+  %v8f32 = call float @llvm.experimental.vector.extract.last.active.v8f32(<8 x float> poison, <8 x i1> poison, float poison)
+  %v4f64 = call double @llvm.experimental.vector.extract.last.active.v4f64(<4 x double> poison, <4 x i1> poison, double poison)
+  %nxv32i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv32i8(<vscale x 32 x i8> poison, <vscale x 32 x i1> poison, i8 poison)
+  %nxv16i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv16i16(<vscale x 16 x i16> poison, <vscale x 16 x i1> poison, i16 poison)
+  %nxv8i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv8i32(<vscale x 8 x i32> poison, <vscale x 8 x i1> poison, i32 poison)
+  %nxv4i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv4i64(<vscale x 4 x i64> poison, <vscale x 4 x i1> poison, i64 poison)
+  %nxv16f16 = call half @llvm.experimental.vector.extract.last.active.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x i1> poison, half poison)
+  %nxv16bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv16bf16(<vscale x 16 x bfloat> poison, <vscale x 16 x i1> poison, bfloat poison)
+  %nxv8f32 = call float @llvm.experimental.vector.extract.last.active.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x i1> poison, float poison)
+  %nxv4f64 = call double @llvm.experimental.vector.extract.last.active.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x i1> poison, double poison)
+
+  %v8i8 = call i8 @llvm.experimental.vector.extract.last.active.v8i8(<8 x i8> poison, <8 x i1> poison, i8 poison)
+  %v4i16 = call i16 @llvm.experimental.vector.extract.last.active.v4i16(<4 x i16> poison, <4 x i1> poison, i16 poison)
+  %v2i32 = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> poison, <2 x i1> poison, i32 poison)
+  %v1i64 = call i64 @llvm.experimental.vector.extract.last.active.v1i64(<1 x i64> poison, <1 x i1> poison, i64 poison)
+  %v4f16 = call half @llvm.experimental.vector.extract.last.active.v4f16(<4 x half> poison, <4 x i1> poison, half poison)
+  %v4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.v4bf16(<4 x bfloat> poison, <4 x i1> poison, bfloat poison)
+  %v2f32 = call float @llvm.experimental.vector.extract.last.active.v2f32(<2 x float> poison, <2 x i1> poison, float poison)
+  %v1f64 = call double @llvm.experimental.vector.extract.last.active.v1f64(<1 x double> poison, <1 x i1> poison, double poison)
+  %nxv8i8 = call i8 @llvm.experimental.vector.extract.last.active.nxv8i8(<vscale x 8 x i8> poison, <vscale x 8 x i1> poison, i8 poison)
+  %nxv4i16 = call i16 @llvm.experimental.vector.extract.last.active.nxv4i16(<vscale x 4 x i16> poison, <vscale x 4 x i1> poison, i16 poison)
+  %nxv2i32 = call i32 @llvm.experimental.vector.extract.last.active.nxv2i32(<vscale x 2 x i32> poison, <vscale x 2 x i1> poison, i32 poison)
+  %nxv1i64 = call i64 @llvm.experimental.vector.extract.last.active.nxv1i64(<vscale x 1 x i64> poison, <vscale x 1 x i1> poison, i64 poison)
+  %nxv4f16 = call half @llvm.experimental.vector.extract.last.active.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> poison, half poison)
+  %nxv4bf16 = call bfloat @llvm.experimental.vector.extract.last.active.nxv4bf16(<vscale x 4 x bfloat> poison, <vscale x 4 x i1> poison, bfloat poison)
+  %nxv2f32 = call float @llvm.experimental.vector.extract.last.active.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> poison, float poison)
+  %nxv1f64 = call double @llvm.experimental.vector.extract.last.active.nxv1f64(<vscale x 1 x double> poison, <vscale x 1 x i1> poison, double poison)
+
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment-fold-tail.ll
new file mode 100644
index 0000000000000..cb907d0d39cc9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/conditional-scalar-assignment-fold-tail.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -mtriple riscv64 -mattr=+v -S < %s | FileCheck %s
+
+define i32 @simple_find_last_reduction(i64 %N, ptr %data, i32 %a) {
+; CHECK-LABEL: define i32 @simple_f...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/184931