[llvm] [LV][EVL] Support icmp/fcmp instruction with EVL-vectorization (PR #108533)

Sat Oct 26 00:40:02 PDT 2024

https://github.com/LiqinWeng updated https://github.com/llvm/llvm-project/pull/108533

>From 506aa7def744260eb5d1fd8f910146285f89777b Mon Sep 17 00:00:00 2001
From: wengliqin <liqin.weng at spacemit.com>
Date: Fri, 13 Sep 2024 18:53:01 +0800
Subject: [PATCH 1/3] [LV][EVL] Support icmp/fcmp instruction with
 EVL-vectorization

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   2 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  28 +++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   3 +-
 ...rize-force-tail-with-evl-cond-reduction.ll |   2 +-
 ...ze-force-tail-with-evl-masked-loadstore.ll |  30 ++---
 ...-force-tail-with-evl-reverse-load-store.ll |  48 +++----
 .../RISCV/vplan-vp-cmp-intrinsics.ll          | 118 ++++++++++++++++++
 7 files changed, 189 insertions(+), 42 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2afd02dae3a8b8..7a2e6aa39e501f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11711,7 +11711,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
           if (auto *FPMO = dyn_cast<FPMathOperator>(II))
             FMF = FPMO->getFastMathFlags();
           IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
-                                      FMF);
+                                      FMF, II);
           InstructionCost IntrCost =
               TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
           InstructionCost CallCost = TTI->getCallInstrCost(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0eb4f7c7c88cee..7eed9941470308 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1456,6 +1456,34 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
 void VPWidenEVLRecipe::execute(VPTransformState &State) {
   unsigned Opcode = getOpcode();
   // TODO: Support other opcodes
+  if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+    Value *Op1 = State.get(getOperand(0), 0);
+    Value *Op2 = State.get(getOperand(1), 0);
+    auto &Ctx = State.Builder.getContext();
+    Value *Pred = MetadataAsValue::get(
+        Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate())));
+
+    IRBuilderBase &BuilderIR = State.Builder;
+    VectorBuilder Builder(BuilderIR);
+    Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+    Builder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true));
+
+    VectorType *DataType = VectorType::get(Type::getInt1Ty(Ctx), State.VF);
+
+    Value *VPInst = Builder.createVectorInstruction(Opcode, DataType,
+                                                    {Op1, Op2, Pred}, "vp.op");
+    //  if (isa<FPMathOperator>(VPInst))
+    // setFlags(cast<Instruction>(VPInst));
+    if (VPInst) {
+      if (auto *VecOp = dyn_cast<CastInst>(VPInst))
+        VecOp->copyIRFlags(getUnderlyingInstr());
+    }
+    State.set(this, VPInst, 0);
+    State.addMetadata(VPInst,
+                      dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+    return;
+  }
+
   if (!Instruction::isBinaryOp(Opcode) && !Instruction::isUnaryOp(Opcode))
     llvm_unreachable("Unsupported opcode in VPWidenEVLRecipe::execute");
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 03c4110761ac6a..fe7ca5c871207b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1476,7 +1476,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
               .Case<VPWidenRecipe>([&](VPWidenRecipe *W) -> VPRecipeBase * {
                 unsigned Opcode = W->getOpcode();
                 if (!Instruction::isBinaryOp(Opcode) &&
-                    !Instruction::isUnaryOp(Opcode))
+                    !Instruction::isUnaryOp(Opcode) &&
+                    Opcode != Instruction::ICmp && Opcode != Instruction::FCmp)
                   return nullptr;
                 return new VPWidenEVLRecipe(*W, EVL);
               })
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index fc12dd54f88dfa..f2ca2f8749525f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -282,7 +282,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; IF-EVL-INLOOP-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), metadata !"sgt", <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i1> zeroinitializer
 ; IF-EVL-INLOOP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> [[TMP20]], i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
index 99da5058fbf922..c915ffd9a80d91 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll
@@ -45,18 +45,18 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
-; IF-EVL-NEXT:    [[TMP17:%.*]] = icmp ne <vscale x 4 x i32> [[VP_OP_LOAD]], zeroinitializer
-; IF-EVL-NEXT:    [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[TMP17]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0
-; IF-EVL-NEXT:    [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP18]], i32 [[TMP10]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD3]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP]], ptr align 4 [[TMP20]], <vscale x 4 x i1> [[TMP18]], i32 [[TMP10]])
-; IF-EVL-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP10]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, metadata !"ne", <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP17:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VP_OP]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP11]]
+; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
+; IF-EVL-NEXT:    [[VP_OP_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP19]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[VP_OP4:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> [[VP_OP_LOAD3]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP10]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_OP4]], ptr align 4 [[TMP19]], <vscale x 4 x i1> [[TMP17]], i32 [[TMP10]])
+; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP10]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
@@ -65,13 +65,13 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; IF-EVL:       for.body:
 ; IF-EVL-NEXT:    [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; IF-EVL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_011]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP23]], 0
+; IF-EVL-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT:    [[CMP1:%.*]] = icmp ne i32 [[TMP22]], 0
 ; IF-EVL-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; IF-EVL:       if.then:
 ; IF-EVL-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_011]]
-; IF-EVL-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
-; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP23]], [[TMP24]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; IF-EVL-NEXT:    [[ADD:%.*]] = add i32 [[TMP22]], [[TMP23]]
 ; IF-EVL-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4
 ; IF-EVL-NEXT:    br label [[FOR_INC]]
 ; IF-EVL:       for.inc:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index c1cf8b0fc541e7..def9d8d4c19124 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -147,33 +147,33 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP8]]
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP15:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; IF-EVL-NEXT:    [[TMP16:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> zeroinitializer
-; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]]
-; IF-EVL-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; IF-EVL-NEXT:    [[TMP20:%.*]] = mul i64 0, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP21:%.*]] = sub i64 1, [[TMP19]]
-; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP17]], i64 [[TMP20]]
-; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP21]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP23]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 100, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), metadata !"slt", <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[VP_OP]], <vscale x 4 x i1> zeroinitializer
+; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; IF-EVL-NEXT:    [[TMP19:%.*]] = mul i64 0, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP20:%.*]] = sub i64 1, [[TMP18]]
+; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP19]]
+; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP20]]
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP22]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]]
-; IF-EVL-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
-; IF-EVL-NEXT:    [[TMP27:%.*]] = mul i64 0, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP28:%.*]] = sub i64 1, [[TMP26]]
-; IF-EVL-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP27]]
-; IF-EVL-NEXT:    [[TMP30:%.*]] = getelementptr i32, ptr [[TMP29]], i64 [[TMP28]]
+; IF-EVL-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP12]]
+; IF-EVL-NEXT:    [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP24]], 4
+; IF-EVL-NEXT:    [[TMP26:%.*]] = mul i64 0, [[TMP25]]
+; IF-EVL-NEXT:    [[TMP27:%.*]] = sub i64 1, [[TMP25]]
+; IF-EVL-NEXT:    [[TMP28:%.*]] = getelementptr i32, ptr [[TMP23]], i64 [[TMP26]]
+; IF-EVL-NEXT:    [[TMP29:%.*]] = getelementptr i32, ptr [[TMP28]], i64 [[TMP27]]
 ; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP30]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP6]])
-; IF-EVL-NEXT:    [[TMP31:%.*]] = zext i32 [[TMP6]] to i64
-; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP31]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP29]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP6]])
+; IF-EVL-NEXT:    [[TMP30:%.*]] = zext i32 [[TMP6]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP30]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
-; IF-EVL-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; IF-EVL-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL:       middle.block:
 ; IF-EVL-NEXT:    br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]]
 ; IF-EVL:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
new file mode 100644
index 00000000000000..4557a3762595ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
@@ -0,0 +1,118 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT:   vector.body:
+; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT:     CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN ir<[[ICMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN-CAST ir<[[ZEXT:%.+]]> = zext ir<[[ICMP]]> to i32
+; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[ZEXT]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
+; IF-EVL-NEXT:   No successors
+; IF-EVL-NEXT: }
+
+entry:
+  %cmp12 = icmp sgt i64 %N, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx3, align 4
+  %cmp4 = icmp sgt i32 %0, %1
+  %conv5 = zext i1 %cmp4 to i32
+  %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %conv5, ptr %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT:   vector.body:
+; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT:     CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT:     WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN ir<[[FCMP:%.+]]> = vp.fcmp ogt ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     WIDEN-CAST ir<[[UITOFP:%.+]]> = uitofp  ir<[[FCMP]]> to float
+; IF-EVL-NEXT:     CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UITOFP]]>, vp<[[EVL]]>
+; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
+; IF-EVL-NEXT:   No successors
+; IF-EVL-NEXT: }
+
+entry:
+  %cmp13 = icmp sgt i64 %N, 0
+  br i1 %cmp13, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx3, align 4
+  %cmp4 = fcmp ogt float %0, %1
+  %conv6 = uitofp i1 %cmp4 to float
+  %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %conv6, ptr %arrayidx8, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
\ No newline at end of file

>From 2347fb13c95a245969ac8cf901effc54e38d4447 Mon Sep 17 00:00:00 2001
From: "Liqin.Weng" <liqin.weng at spacemit.com>
Date: Wed, 16 Oct 2024 18:03:55 +0800
Subject: [PATCH 2/3] rebase and fix the comments

---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 16 ++----
 ...rize-force-tail-with-evl-cond-reduction.ll |  2 +-
 .../RISCV/vplan-vp-cmp-intrinsics.ll          | 56 +++++++++----------
 .../RISCV/vplan-vp-select-intrinsics.ll       |  2 +-
 4 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7eed9941470308..cbfbca4c5e8088 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1465,19 +1465,15 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) {
 
     IRBuilderBase &BuilderIR = State.Builder;
     VectorBuilder Builder(BuilderIR);
+
     Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
     Builder.setMask(Mask).setEVL(State.get(getEVL(), /*NeedsScalar=*/true));
-
-    VectorType *DataType = VectorType::get(Type::getInt1Ty(Ctx), State.VF);
-
-    Value *VPInst = Builder.createVectorInstruction(Opcode, DataType,
+    VectorType *RetType = VectorType::get(Type::getInt1Ty(Ctx), State.VF);
+    Value *VPInst = Builder.createVectorInstruction(Opcode, RetType,
                                                     {Op1, Op2, Pred}, "vp.op");
-    //  if (isa<FPMathOperator>(VPInst))
-    // setFlags(cast<Instruction>(VPInst));
-    if (VPInst) {
-      if (auto *VecOp = dyn_cast<CastInst>(VPInst))
-        VecOp->copyIRFlags(getUnderlyingInstr());
-    }
+    if (auto *VecOp = dyn_cast<CastInst>(VPInst))
+      VecOp->copyIRFlags(getUnderlyingInstr());
+
     State.set(this, VPInst, 0);
     State.addMetadata(VPInst,
                       dyn_cast_or_null<Instruction>(getUnderlyingValue()));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index f2ca2f8749525f..1166d4cdc76b3a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -69,7 +69,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]]
 ; IF-EVL-INLOOP-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
 ; IF-EVL-INLOOP-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
-; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[VP_OP_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; IF-EVL-INLOOP-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> %vp.op.load, <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), metadata !"sgt", <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 %9)
 ; IF-EVL-INLOOP-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vp.select.nxv4i32(<vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i32> zeroinitializer, i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, <vscale x 4 x i32> [[TMP20]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP12]])
 ; IF-EVL-INLOOP-NEXT:    [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
index 4557a3762595ff..67de3460069b92 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
@@ -40,25 +40,24 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: }
 
 entry:
-  %cmp12 = icmp sgt i64 %N, 0
-  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+  br label %loop
 
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv
   %0 = load i32, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx3, align 4
+  %gep = getelementptr inbounds i32, ptr %c, i64 %iv
+  %1 = load i32, ptr %gep, align 4
   %cmp4 = icmp sgt i32 %0, %1
   %conv5 = zext i1 %cmp4 to i32
-  %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  %arrayidx7 = getelementptr inbounds i32, ptr %a, i64 %iv
   store i32 %conv5, ptr %arrayidx7, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
 }
 
 define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
@@ -96,23 +95,22 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT: }
 
 entry:
-  %cmp13 = icmp sgt i64 %N, 0
-  br i1 %cmp13, label %for.body, label %for.cond.cleanup
+  br label %loop
 
-for.cond.cleanup:
-  ret void
-
-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %arrayidx = getelementptr inbounds float, ptr %b, i64 %indvars.iv
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, ptr %b, i64 %iv
   %0 = load float, ptr %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds float, ptr %c, i64 %indvars.iv
-  %1 = load float, ptr %arrayidx3, align 4
+  %gep = getelementptr inbounds float, ptr %c, i64 %iv
+  %1 = load float, ptr %gep, align 4
   %cmp4 = fcmp ogt float %0, %1
   %conv6 = uitofp i1 %cmp4 to float
-  %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  %arrayidx8 = getelementptr inbounds float, ptr %a, i64 %iv
   store float %conv6, ptr %arrayidx8, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
-}
\ No newline at end of file
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index 6d6cfb5e9d18ed..81ca2486588e52 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -27,7 +27,7 @@
  ; IF-EVL-NEXT:     CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
  ; IF-EVL-NEXT:     vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
  ; IF-EVL-NEXT:     WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
- ; IF-EVL-NEXT:     WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
+ ; IF-EVL-NEXT:     WIDEN ir<[[CMP:%.+]]> = vp.icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
  ; IF-EVL-NEXT:     WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]>
  ; IF-EVL-NEXT:     WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>)
  ; IF-EVL-NEXT:     WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]>

>From f448fafe2174ed4493811a1fd048cff50342b551 Mon Sep 17 00:00:00 2001
From: "Liqin.Weng" <liqin.weng at spacemit.com>
Date: Fri, 25 Oct 2024 17:48:37 +0800
Subject: [PATCH 3/3] Rebase && fix the comments

---
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp       | 10 +++++-----
 .../LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll   | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cbfbca4c5e8088..440d74dc259dc3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1457,8 +1457,8 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) {
   unsigned Opcode = getOpcode();
   // TODO: Support other opcodes
   if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
-    Value *Op1 = State.get(getOperand(0), 0);
-    Value *Op2 = State.get(getOperand(1), 0);
+    Value *Op1 = State.get(getOperand(0));
+    Value *Op2 = State.get(getOperand(1));
     auto &Ctx = State.Builder.getContext();
     Value *Pred = MetadataAsValue::get(
         Ctx, MDString::get(Ctx, CmpInst::getPredicateName(getPredicate())));
@@ -1471,10 +1471,10 @@ void VPWidenEVLRecipe::execute(VPTransformState &State) {
     VectorType *RetType = VectorType::get(Type::getInt1Ty(Ctx), State.VF);
     Value *VPInst = Builder.createVectorInstruction(Opcode, RetType,
                                                     {Op1, Op2, Pred}, "vp.op");
-    if (auto *VecOp = dyn_cast<CastInst>(VPInst))
-      VecOp->copyIRFlags(getUnderlyingInstr());
+    if (isa<FPMathOperator>(VPInst))
+      setFlags(cast<Instruction>(VPInst));
 
-    State.set(this, VPInst, 0);
+    State.set(this, VPInst);
     State.addMetadata(VPInst,
                       dyn_cast_or_null<Instruction>(getUnderlyingValue()));
     return;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
index 67de3460069b92..312bb75a176221 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-cmp-intrinsics.ll
@@ -17,7 +17,7 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL: <x1> vector loop: {
 ; IF-EVL-NEXT:   vector.body:
 ; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEXT:%.+]]>
 ; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
 ; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
@@ -33,8 +33,8 @@ define void @vp_icmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[ZEXT]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
-; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
 ; IF-EVL-NEXT:   No successors
 ; IF-EVL-NEXT: }
@@ -72,7 +72,7 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL: <x1> vector loop: {
 ; IF-EVL-NEXT:   vector.body:
 ; IF-EVL-NEXT:     EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
-; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]>
+; IF-EVL-NEXT:     EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]>  = phi ir<0>, vp<[[IV_NEXT:%.+]]>
 ; IF-EVL-NEXT:     EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
 ; IF-EVL-NEXT:     EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
 ; IF-EVL-NEXT:     vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
@@ -88,8 +88,8 @@ define void @vp_fcmp(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-NEXT:     vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
 ; IF-EVL-NEXT:     WIDEN vp.store vp<[[PTR3]]>, ir<[[UITOFP]]>, vp<[[EVL]]>
 ; IF-EVL-NEXT:     SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
-; IF-EVL-NEXT:     EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
-; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT:     EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
 ; IF-EVL-NEXT:     EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>,  vp<[[VTC]]>
 ; IF-EVL-NEXT:   No successors
 ; IF-EVL-NEXT: }