[llvm] 00e40c9 - [LV] Support binary and unary operations with EVL-vectorization (#93854)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 6 08:41:40 PDT 2024
Author: Kolya Panchenko
Date: 2024-09-06T11:41:36-04:00
New Revision: 00e40c9b5b1a8208c4f2b785138d79ad0e9107af
URL: https://github.com/llvm/llvm-project/commit/00e40c9b5b1a8208c4f2b785138d79ad0e9107af
DIFF: https://github.com/llvm/llvm-project/commit/00e40c9b5b1a8208c4f2b785138d79ad0e9107af.diff
LOG: [LV] Support binary and unary operations with EVL-vectorization (#93854)
The patch adds `VPWidenEVLRecipe` which represents `VPWidenRecipe` + EVL
argument. The new recipe replaces `VPWidenRecipe` in
`tryAddExplicitVectorLength` for each binary and unary operations.
Follow up patches will extend support for remaining cases, like `FCmp`
and `ICmp`
Added:
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
Modified:
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/lib/Transforms/Vectorize/VPlanValue.h
llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9ad98a5371d814..cec3135395483b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -923,6 +923,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenCastSC:
case VPRecipeBase::VPWidenGEPSC:
case VPRecipeBase::VPWidenSC:
+ case VPRecipeBase::VPWidenEVLSC:
case VPRecipeBase::VPWidenSelectSC:
case VPRecipeBase::VPBlendSC:
case VPRecipeBase::VPPredInstPHISC:
@@ -1107,6 +1108,7 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC ||
R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenGEPSC ||
R->getVPDefID() == VPRecipeBase::VPWidenCastSC ||
R->getVPDefID() == VPRecipeBase::VPReplicateSC ||
@@ -1410,11 +1412,16 @@ class VPInstruction : public VPRecipeWithIRFlags {
class VPWidenRecipe : public VPRecipeWithIRFlags {
unsigned Opcode;
+protected:
+ template <typename IterT>
+ VPWidenRecipe(unsigned VPDefOpcode, Instruction &I,
+ iterator_range<IterT> Operands)
+ : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {}
+
public:
template <typename IterT>
VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
- : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I),
- Opcode(I.getOpcode()) {}
+ : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {}
~VPWidenRecipe() override = default;
@@ -1424,7 +1431,15 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
return R;
}
- VP_CLASSOF_IMPL(VPDef::VPWidenSC)
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenEVLSC;
+ }
+
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && classof(R);
+ }
/// Produce a widened instruction using the opcode and operands of the recipe,
/// processing State.VF elements.
@@ -1443,6 +1458,54 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
#endif
};
+/// A recipe for widening operations with vector-predication intrinsics with
+/// explicit vector length (EVL).
+class VPWidenEVLRecipe : public VPWidenRecipe {
+ using VPRecipeWithIRFlags::transferFlags;
+
+public:
+ template <typename IterT>
+ VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL)
+ : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) {
+ addOperand(&EVL);
+ }
+ VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL)
+ : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) {
+ transferFlags(W);
+ }
+
+ ~VPWidenEVLRecipe() override = default;
+
+ VPWidenRecipe *clone() override final {
+ llvm_unreachable("VPWidenEVLRecipe cannot be cloned");
+ return nullptr;
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC);
+
+ VPValue *getEVL() { return getOperand(getNumOperands() - 1); }
+ const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); }
+
+ /// Produce a vp-intrinsic using the opcode and operands of the recipe,
+ /// processing EVL elements.
+ void execute(VPTransformState &State) override final;
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ // EVL in that recipe is always the last operand, thus any use before means
+ // the VPValue should be vectorized.
+ return getEVL() == Op;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override final;
+#endif
+};
+
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
class VPWidenCastRecipe : public VPRecipeWithIRFlags {
/// Cast instruction opcode.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d79a7e814ecb3a..f091ee5a71b297 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -263,8 +263,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
return inferScalarType(R->getOperand(0));
})
- .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPWidenEVLRecipe,
+ VPReplicateRecipe, VPWidenCallRecipe, VPWidenMemoryRecipe,
+ VPWidenSelectRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7708763aef859d..ea49d29e28d831 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -24,6 +24,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/VectorBuilder.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -74,6 +75,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenLoadSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -114,6 +116,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
case VPWidenIntOrFpInductionSC:
case VPWidenPHISC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -164,6 +167,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPWidenPHISC:
case VPWidenPointerInductionSC:
case VPWidenSC:
+ case VPWidenEVLSC:
case VPWidenSelectSC: {
const Instruction *I =
dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
@@ -1262,6 +1266,45 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
}
}
+void VPWidenEVLRecipe::execute(VPTransformState &State) {
+ unsigned Opcode = getOpcode();
+ // TODO: Support other opcodes
+ if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
+ llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
+
+ State.setDebugLocFrom(getDebugLoc());
+ assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
+ "explicit vector length.");
+ VPValue *Op0 = getOperand(0);
+
+ assert(State.get(Op0, 0)->getType()->isVectorTy() &&
+ "VPWidenEVLRecipe should not be used for scalars");
+
+ VPValue *EVL = getEVL();
+ Value *EVLArg = State.get(EVL, 0, /*NeedsScalar=*/true);
+ IRBuilderBase &BuilderIR = State.Builder;
+ VectorBuilder Builder(BuilderIR);
+ Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+
+ SmallVector<Value *, 4> Ops;
+ for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+ VPValue *VPOp = getOperand(I);
+ Ops.push_back(State.get(VPOp, 0));
+ }
+
+ Builder.setMask(Mask).setEVL(EVLArg);
+ Value *VPInst =
+ Builder.createVectorInstruction(Opcode, Ops[0]->getType(), Ops, "vp.op");
+ // Currently vp-intrinsics only accept FMF flags.
+ // TODO: Enable other flags when support is added.
+ if (isa<FPMathOperator>(VPInst))
+ setFlags(cast<Instruction>(VPInst));
+
+ State.set(this, VPInst, 0);
+ State.addMetadata(VPInst,
+ dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
@@ -1271,6 +1314,15 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
printFlags(O);
printOperands(O, SlotTracker);
}
+
+void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN-VP ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(getOpcode());
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
#endif
void VPWidenCastRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 39f7bf55ee5eb7..eaf54464457d5e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -13,6 +13,7 @@
#include "VPlanTransforms.h"
#include "VPRecipeBuilder.h"
+#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
@@ -21,6 +22,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Intrinsics.h"
@@ -1316,6 +1318,63 @@ void VPlanTransforms::addActiveLaneMask(
HeaderMask->replaceAllUsesWith(LaneMask);
}
+/// Replace recipes with their EVL variants.
+static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+ SmallVector<VPValue *> HeaderMasks = collectAllHeaderMasks(Plan);
+ for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
+ for (VPUser *U : collectUsersRecursively(HeaderMask)) {
+ auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
+ if (!CurRecipe)
+ continue;
+ auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
+ assert(OrigMask && "Unmasked recipe when folding tail");
+ return HeaderMask == OrigMask ? nullptr : OrigMask;
+ };
+
+ VPRecipeBase *NewRecipe =
+ TypeSwitch<VPRecipeBase *, VPRecipeBase *>(CurRecipe)
+ .Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
+ VPValue *NewMask = GetNewMask(L->getMask());
+ return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+ })
+ .Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
+ VPValue *NewMask = GetNewMask(S->getMask());
+ return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+ })
+ .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
+ unsigned Opcode = W->getOpcode();
+ if (!Instruction::isBinaryOp(Opcode) &&
+ !Instruction::isUnaryOp(Opcode))
+ return nullptr;
+ return new VPWidenEVLRecipe(*W, EVL);
+ })
+ .Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
+ VPValue *NewMask = GetNewMask(Red->getCondOp());
+ return new VPReductionEVLRecipe(*Red, EVL, NewMask);
+ })
+ .Default([&](VPRecipeBase *R) { return nullptr; });
+
+ if (!NewRecipe)
+ continue;
+
+ [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
+ assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
+ "New recipe must define the same number of values as the "
+ "original.");
+ assert(
+ NumDefVal <= 1 &&
+ "Only supports recipes with a single definition or without users.");
+ NewRecipe->insertBefore(CurRecipe);
+ if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
+ VPValue *CurVPV = CurRecipe->getVPSingleValue();
+ CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
+ }
+ CurRecipe->eraseFromParent();
+ }
+ recursivelyDeleteDeadRecipes(HeaderMask);
+ }
+}
+
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe
@@ -1385,48 +1444,8 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
NextEVLIV->insertBefore(CanonicalIVIncrement);
EVLPhi->addOperand(NextEVLIV);
- for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
- for (VPUser *U : collectUsersRecursively(HeaderMask)) {
- VPRecipeBase *NewRecipe = nullptr;
- auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
- if (!CurRecipe)
- continue;
-
- auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
- assert(OrigMask && "Unmasked recipe when folding tail");
- return HeaderMask == OrigMask ? nullptr : OrigMask;
- };
- if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
- VPValue *NewMask = GetNewMask(MemR->getMask());
- if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
- NewRecipe = new VPWidenLoadEVLRecipe(*L, *VPEVL, NewMask);
- else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
- NewRecipe = new VPWidenStoreEVLRecipe(*S, *VPEVL, NewMask);
- else
- llvm_unreachable("unsupported recipe");
- } else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
- NewRecipe = new VPReductionEVLRecipe(*RedR, *VPEVL,
- GetNewMask(RedR->getCondOp()));
- }
+ transformRecipestoEVLRecipes(Plan, *VPEVL);
- if (NewRecipe) {
- [[maybe_unused]] unsigned NumDefVal = NewRecipe->getNumDefinedValues();
- assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
- "New recipe must define the same number of values as the "
- "original.");
- assert(
- NumDefVal <= 1 &&
- "Only supports recipes with a single definition or without users.");
- NewRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe>(NewRecipe)) {
- VPValue *CurVPV = CurRecipe->getVPSingleValue();
- CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
- }
- CurRecipe->eraseFromParent();
- }
- }
- recursivelyDeleteDeadRecipes(HeaderMask);
- }
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 452c977106a773..b8b2c0bd4d5ff1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -356,6 +356,7 @@ class VPDef {
VPWidenStoreEVLSC,
VPWidenStoreSC,
VPWidenSC,
+ VPWidenEVLSC,
VPWidenSelectSC,
VPBlendSC,
// START: Phi-like recipes. Need to be kept together.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index deb9f0f9bb7e07..0381f6dae9811f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -385,6 +385,67 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ]
; IF-EVL-INLOOP-NEXT: ret i32 [[SMIN_LCSSA]]
;
+; IF-EVL-LABEL: @smin(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[START:%.*]], i64 0
+; IF-EVL-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP10]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP11]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP9]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> poison)
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SMIN:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP20]], [[RDX]]
+; IF-EVL-NEXT: [[SMIN]] = select i1 [[CMP_I]], i32 [[TMP20]], i32 [[RDX]]
+; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL: for.end:
+; IF-EVL-NEXT: [[SMIN_LCSSA:%.*]] = phi i32 [ [[SMIN]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
+; IF-EVL-NEXT: ret i32 [[SMIN_LCSSA]]
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
new file mode 100644
index 00000000000000..501e27d73737c2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
@@ -0,0 +1,1763 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=IF-EVL
+
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s --check-prefix=NO-VP
+
+
+define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_and(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.and.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = and i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_and(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = and i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = and i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_or(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.or.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = or i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_or(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = or i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = or i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_xor(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.xor.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = xor i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_xor(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = xor i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = xor i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_shl(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.shl.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = shl i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_shl(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = shl i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = shl i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_lshr(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.lshr.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = lshr i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_lshr(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = lshr i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = lshr i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_ashr(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.ashr.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = ashr i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_ashr(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = ashr i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = ashr i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_add(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.add.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = add i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_add(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = add i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = add i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_sub(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.sub.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = sub i8 [[TMP20]], 1
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_sub(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = sub i8 [[TMP0]], 1
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = sub i8 %0, 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_mul(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.mul.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = mul i8 [[TMP20]], 3
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_mul(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = mul i8 [[TMP0]], 3
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = mul i8 %0, 3
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_sdiv(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.sdiv.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = sdiv i8 [[TMP20]], 3
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_sdiv(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = sdiv i8 [[TMP0]], 3
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = sdiv i8 %0, 3
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_udiv(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.udiv.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = udiv i8 [[TMP20]], 3
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_udiv(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = udiv i8 [[TMP0]], 3
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = udiv i8 %0, 3
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_srem(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.srem.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = srem i8 [[TMP20]], 3
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_srem(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = srem i8 [[TMP0]], 3
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = srem i8 %0, 3
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_urem(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP1]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16
+; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP7]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP6]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 16
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP10]], i32 16, i1 true)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.vp.urem.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 3, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP15]], ptr align 1 [[TMP17]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), i32 [[TMP11]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; IF-EVL-NEXT: [[TMP:%.*]] = urem i8 [[TMP20]], 3
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_urem(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; NO-VP-NEXT: [[TMP:%.*]] = urem i8 [[TMP0]], 3
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %len
+ %0 = load i8, ptr %arrayidx, align 1
+ %tmp = urem i8 %0, 3
+ %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 %len
+ store i8 %tmp, ptr %arrayidx1, align 1
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+; Floating point tests
+
+define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_fadd(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fadd.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = fadd fast float [[TMP21]], 3.000000e+00
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_fadd(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = fadd fast float [[TMP0]], 3.000000e+00
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = fadd fast float %0, 3.000000e+00
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_fsub(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fsub.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = fsub fast float [[TMP21]], 3.000000e+00
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_fsub(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = fsub fast float [[TMP0]], 3.000000e+00
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = fsub fast float %0, 3.000000e+00
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_fmul(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fmul.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = fmul fast float [[TMP21]], 3.000000e+00
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP33:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_fmul(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = fmul fast float [[TMP0]], 3.000000e+00
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = fmul fast float %0, 3.000000e+00
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_fdiv(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fdiv.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = fdiv fast float [[TMP21]], 3.000000e+00
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP35:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_fdiv(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = fdiv fast float [[TMP0]], 3.000000e+00
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = fdiv fast float %0, 3.000000e+00
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_frem(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_frem(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = frem fast float [[TMP0]], 3.000000e+00
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_frem(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = frem fast float [[TMP0]], 3.000000e+00
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = frem fast float %0, 3.000000e+00
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
+
+define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) {
+; IF-EVL-LABEL: define void @test_fneg(
+; IF-EVL-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; IF-EVL-NEXT: [[LOOP_PREHEADER:.*]]:
+; IF-EVL-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; IF-EVL-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; IF-EVL-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IF-EVL: [[VECTOR_MEMCHECK]]:
+; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = sub i64 [[B1]], [[A2]]
+; IF-EVL-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
+; IF-EVL-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IF-EVL: [[VECTOR_PH]]:
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 100, [[TMP8]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: br label %[[VECTOR_BODY:.*]]
+; IF-EVL: [[VECTOR_BODY]]:
+; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 100, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = call fast <vscale x 4 x float> @llvm.vp.fneg.nxv4f32(<vscale x 4 x float> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP17]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4f32.p0(<vscale x 4 x float> [[TMP16]], ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
+; IF-EVL: [[MIDDLE_BLOCK]]:
+; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL: [[SCALAR_PH]]:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: br label %[[LOOP:.*]]
+; IF-EVL: [[LOOP]]:
+; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; IF-EVL-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP:%.*]] = fneg fast float [[TMP21]]
+; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP37:![0-9]+]]
+; IF-EVL: [[FINISH_LOOPEXIT]]:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: define void @test_fneg(
+; NO-VP-SAME: ptr nocapture [[A:%.*]], ptr nocapture readonly [[B:%.*]]) #[[ATTR0]] {
+; NO-VP-NEXT: [[LOOP_PREHEADER:.*]]:
+; NO-VP-NEXT: br label %[[LOOP:.*]]
+; NO-VP: [[LOOP]]:
+; NO-VP-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; NO-VP-NEXT: [[DEC]] = add nsw i64 [[LEN]], 1
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[LEN]]
+; NO-VP-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[TMP:%.*]] = fneg fast float [[TMP0]]
+; NO-VP-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]]
+; NO-VP-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4
+; NO-VP-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100
+; NO-VP-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT:.*]], label %[[LOOP]]
+; NO-VP: [[FINISH_LOOPEXIT]]:
+; NO-VP-NEXT: ret void
+;
+loop.preheader:
+ br label %loop
+
+loop:
+ %len = phi i64 [ %dec, %loop ], [ 0, %loop.preheader ]
+ %dec = add nsw i64 %len, 1
+ %arrayidx = getelementptr inbounds float, ptr %a, i64 %len
+ %0 = load float, ptr %arrayidx, align 4
+ %tmp = fneg fast float %0
+ %arrayidx1 = getelementptr inbounds float, ptr %b, i64 %len
+ store float %tmp, ptr %arrayidx1, align 4
+ %.not = icmp eq i64 %dec, 100
+ br i1 %.not, label %finish.loopexit, label %loop
+
+finish.loopexit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
index b8b2558247fa64..99da5058fbf922 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -20,43 +20,43 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true)
-; IF-EVL-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0
+; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[EVL_BASED_IV]], 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP19:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
-; IF-EVL-NEXT: [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP23:%.*]] = add <vscale x 4 x i32> [[VP_OP_LOAD]], [[VP_OP_LOAD3]]
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP23]], ptr align 4 [[TMP22]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP24:%.*]] = zext i32 [[TMP12]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP24]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP13]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
+; IF-EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP18]], i32 [[TMP10]])
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD3]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP18]], i32 [[TMP10]])
+; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -65,13 +65,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP26]], 0
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[CMP1:%.*]] = icmp ne i32 [[TMP23]], 0
; IF-EVL-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; IF-EVL: if.then:
; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add i32 [[TMP23]], [[TMP24]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
; IF-EVL-NEXT: br label [[FOR_INC]]
; IF-EVL: for.inc:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index c5a89d48f77b0b..c1cf8b0fc541e7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -16,46 +16,46 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], -1
-; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP13]]
-; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP13]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 0, [[TMP20]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP20]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP24]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[TMP25:%.*]] = zext i32 [[TMP8]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP25]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; IF-EVL-NEXT: [[TMP12:%.*]] = mul i64 0, [[TMP11]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = sub i64 1, [[TMP11]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP8]]
+; IF-EVL-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 0, [[TMP18]]
+; IF-EVL-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[TMP19]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP22]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -119,61 +119,61 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP1]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP2]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]]
; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32
-; IF-EVL-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP7:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP7]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP5]], i32 4, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32
-; IF-EVL-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX3]], 0
+; IF-EVL-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX3]], 0
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP12]]
-; IF-EVL-NEXT: [[TMP13:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT: [[TMP14:%.*]] = add i64 [[TMP9]], -1
-; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP10]]
-; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[TMP17:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP21]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP21]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP22]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP8]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP14]]
-; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4
-; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 0, [[TMP28]]
-; IF-EVL-NEXT: [[TMP30:%.*]] = sub i64 1, [[TMP28]]
-; IF-EVL-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]]
-; IF-EVL-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP32]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP8]])
-; IF-EVL-NEXT: [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; IF-EVL-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
+; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP10]]
+; IF-EVL-NEXT: [[TMP11:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[TMP12:%.*]] = add i64 [[TMP7]], -1
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP8]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
+; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP15:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; IF-EVL-NEXT: [[TMP20:%.*]] = mul i64 0, [[TMP19]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = sub i64 1, [[TMP19]]
+; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP20]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]]
+; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP23]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; IF-EVL-NEXT: [[TMP27:%.*]] = mul i64 0, [[TMP26]]
+; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP26]]
+; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP27]]
+; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP28]]
+; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP30]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP6]])
+; IF-EVL-NEXT: [[TMP31:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP31]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
+; IF-EVL-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
index 362bfd61ebd076..83f7dc3702b086 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll
@@ -39,15 +39,15 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP13]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP18:%.*]] = add nsw <vscale x 4 x i32> [[VP_OP_LOAD1]], [[VP_OP_LOAD]]
-; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP18]], ptr align 4 [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD1]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP13]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP19]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP12]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL: middle.block:
; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; IF-EVL: scalar.ph:
@@ -56,10 +56,10 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL: for.body:
; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
-; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP23]]
+; IF-EVL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP22]]
; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
index 8caa9368bfde18..04b3ba52cbefc6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll
@@ -31,7 +31,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>
+; IF-EVL-NEXT: WIDEN-VP ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]>, vp<[[EVL]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]>
More information about the llvm-commits
mailing list