[llvm] [LV][EVL] Support call instruction with EVL-vectorization (PR #110412)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 17 03:34:56 PDT 2024
https://github.com/LiqinWeng updated https://github.com/llvm/llvm-project/pull/110412
>From 26111326f544fca6b515167c6f463744d50b82c0 Mon Sep 17 00:00:00 2001
From: LiqinWeng <liqin.weng at spacemit.com>
Date: Sat, 28 Sep 2024 16:44:48 +0800
Subject: [PATCH 1/2] [LV][EVL][Test] Prepare test for adding intrinsics of
call Recipe
---
.../RISCV/vplan-vp-call-intrinsics.ll | 223 ++++++++++++++++++
1 file changed, 223 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
new file mode 100644
index 00000000000000..4970f6ac34928b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -0,0 +1,223 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \
+; RUN: -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s
+
+define void @vp_smax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.smax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %. = tail call i32 @llvm.smax.i32(i32 %0, i32 %1)
+ %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %., ptr %arrayidx11, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @vp_smin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.smin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %. = tail call i32 @llvm.smin.i32(i32 %0, i32 %1)
+ %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %., ptr %arrayidx11, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @vp_umax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.umax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %. = tail call i32 @llvm.umax.i32(i32 %0, i32 %1)
+ %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %., ptr %arrayidx11, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+define void @vp_umin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
+; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' {
+; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF
+; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count
+; IF-EVL-NEXT: Live-in ir<%N> = original trip-count
+
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: Successor(s): vector loop
+
+; IF-EVL: <x1> vector loop: {
+; IF-EVL-NEXT: vector.body:
+; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION
+; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]>
+; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]>
+; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1>
+; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
+; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.umin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
+; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
+; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]>
+; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]>
+; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]>
+; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]>
+; IF-EVL-NEXT: No successors
+; IF-EVL-NEXT: }
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx3, align 4
+ %. = tail call i32 @llvm.umin.i32(i32 %0, i32 %1)
+ %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+ store i32 %., ptr %arrayidx11, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+ br i1 %exitcond.not, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
\ No newline at end of file
>From 35d8632ac860c35f8baba04b4623e89e802811f8 Mon Sep 17 00:00:00 2001
From: LiqinWeng <liqin.weng at spacemit.com>
Date: Wed, 16 Oct 2024 17:01:58 +0800
Subject: [PATCH 2/2] [LV][EVL] Support call instruction with EVL-vectorization
---
llvm/include/llvm/Analysis/VectorUtils.h | 5 +
llvm/include/llvm/IR/VectorBuilder.h | 4 +-
llvm/lib/Analysis/VectorUtils.cpp | 9 ++
llvm/lib/IR/VectorBuilder.cpp | 9 +-
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 8 ++
llvm/lib/Transforms/Utils/LoopUtils.cpp | 4 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 1 -
llvm/lib/Transforms/Vectorize/VPlan.h | 14 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 33 ++++--
.../Transforms/Vectorize/VPlanTransforms.cpp | 11 ++
.../RISCV/vplan-vp-call-intrinsics.ll | 112 +++++++++---------
11 files changed, 135 insertions(+), 75 deletions(-)
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index e2dd4976f39065..f2fdf6564b723f 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -160,6 +160,11 @@ bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx);
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI);
+/// Returns the corresponding VP intrinsic ID for a call instruction.
+/// For the given call it looks up the matching VP intrinsic and returns its
+/// intrinsic ID; if no mapping is found, it returns not_intrinsic.
+Intrinsic::ID getVPIntrinsicIDForCall(const CallInst *CI);
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
index b0277c2b52595e..31a64eddada877 100644
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ b/llvm/include/llvm/IR/VectorBuilder.h
@@ -99,11 +99,11 @@ class VectorBuilder {
const Twine &Name = Twine());
/// Emit a VP reduction intrinsic call for recurrence kind.
- /// \param RdxID The intrinsic ID of llvm.vector.reduce.*
+  /// \param ID The intrinsic ID of the call intrinsic to emit.
/// \param ValTy The type of operand which the reduction operation is
/// performed.
/// \param VecOpArray The operand list.
- Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
+  Value *createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
ArrayRef<Value *> VecOpArray,
const Twine &Name = Twine());
};
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8a5f81d9..70e0a15ac414e2 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -169,6 +169,15 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
return Intrinsic::not_intrinsic;
}
+Intrinsic::ID llvm::getVPIntrinsicIDForCall(const CallInst *CI) {
+ const Function *F = CI->getCalledFunction();
+ if (!F)
+ return Intrinsic::not_intrinsic;
+
+  return F->isIntrinsic() ? VPIntrinsic::getForIntrinsic(F->getIntrinsicID())
+                          : Intrinsic::not_intrinsic;
+}
+
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
index 737f49b1334d76..d629a2fb6af7b3 100644
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ b/llvm/lib/IR/VectorBuilder.cpp
@@ -60,13 +60,12 @@ Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
}
-Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
- Type *ValTy,
+Value *VectorBuilder::createSimpleIntrinsic(Intrinsic::ID ID, Type *ValTy,
ArrayRef<Value *> InstOpArray,
const Twine &Name) {
- auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
- assert(VPReductionIntrinsic::isVPReduction(VPID) &&
- "No VPIntrinsic for this reduction");
+ auto VPID = VPIntrinsic::getForIntrinsic(ID);
+ assert(VPIntrinsic::isVPIntrinsic(VPID) &&
+ "No VPIntrinsic for this Intrinsic");
return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index cba73abdd15028..20b26fdacf8642 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1073,6 +1073,14 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return getArithmeticInstrCost(*FOp, ICA.getReturnType(), CostKind);
break;
}
+  // TODO: Compute a more accurate cost for these intrinsics in a follow-up patch.
+ case Intrinsic::vp_smax:
+ case Intrinsic::vp_smin:
+ case Intrinsic::vp_umax:
+ case Intrinsic::vp_umin: {
+ // return LT.first;
+ return 1;
+ }
// vp int cast ops.
case Intrinsic::vp_trunc:
case Intrinsic::vp_zext:
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 70047273c3b9af..2dac2d43f7f3a3 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1300,7 +1300,7 @@ Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
Type *SrcEltTy = SrcTy->getElementType();
Value *Iden = getRecurrenceIdentity(Kind, SrcEltTy, Desc.getFastMathFlags());
Value *Ops[] = {Iden, Src};
- return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+ return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}
Value *llvm::createReduction(IRBuilderBase &B,
@@ -1343,7 +1343,7 @@ Value *llvm::createOrderedReduction(VectorBuilder &VBuilder,
Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
auto *SrcTy = cast<VectorType>(Src->getType());
Value *Ops[] = {Start, Src};
- return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+ return VBuilder.createSimpleIntrinsic(Id, SrcTy, Ops);
}
void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8bf92f3480620a..9cabd5aceea804 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8351,7 +8351,6 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
return nullptr;
SmallVector<VPValue *, 4> Ops(Operands.take_front(CI->arg_size()));
-
// Is it beneficial to perform intrinsic call compared to lib call?
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4cef47e69f0e3b..3e6b08de8bdbe4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1708,6 +1708,20 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags {
/// Returns true if the intrinsic may write to memory.
bool mayWriteToMemory() const { return MayWriteToMemory; }
+ operand_range arg_operands() {
+    unsigned ArgNum = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)
+                          ? getNumOperands() - 1
+                          : getNumOperands();
+    return make_range(op_begin(), op_begin() + ArgNum);
+ }
+
+ const_operand_range arg_operands() const {
+    unsigned ArgNum = VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)
+                          ? getNumOperands() - 1
+                          : getNumOperands();
+    return make_range(op_begin(), op_begin() + ArgNum);
+ }
+
/// Returns true if the intrinsic may have side-effects.
bool mayHaveSideEffects() const { return MayHaveSideEffects; }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6fe30356e8c912..2d5d69ca21b2e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -970,7 +970,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
SmallVector<Value *, 4> Args;
- for (const auto &I : enumerate(operands())) {
+ for (const auto &I : enumerate(arg_operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
// vector.
Value *Arg;
@@ -983,18 +983,33 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
Args.push_back(Arg);
}
- // Use vector version of the intrinsic.
- Module *M = State.Builder.GetInsertBlock()->getModule();
- Function *VectorF =
- Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
-
+ CallInst *V = nullptr;
auto *CI = cast_or_null<CallInst>(getUnderlyingValue());
SmallVector<OperandBundleDef, 1> OpBundles;
if (CI)
CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
+ if (VPIntrinsic::isVPIntrinsic(VectorIntrinsicID)) {
+    // Use the vector-predicate (VP) version of the intrinsic.
+ IRBuilderBase &BuilderIR = State.Builder;
+ VectorBuilder VBuilder(BuilderIR);
+ Value *Mask = BuilderIR.CreateVectorSplat(State.VF, BuilderIR.getTrue());
+ VBuilder.setMask(Mask).setEVL(
+ State.get(getOperand(getNumOperands() - 1), /*NeedsScalar=*/true));
+ auto *TyReturn = VectorType::get(getResultType(), State.VF);
+ Value *VPInst = VBuilder.createSimpleIntrinsic(VectorIntrinsicID, TyReturn,
+ Args, "vp.call");
+ if (VPInst) {
+ V = cast<CallInst>(VPInst);
+ }
+ } else {
+ // Use vector version of the intrinsic.
+ Module *M = State.Builder.GetInsertBlock()->getModule();
+ Function *VectorF =
+ Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl);
+ assert(VectorF && "Can't retrieve vector intrinsic.");
+ V = State.Builder.CreateCall(VectorF, Args, OpBundles);
+ }
setFlags(V);
@@ -1013,7 +1028,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
// clear Arguments.
// TODO: Rework TTI interface to be independent of concrete IR values.
SmallVector<const Value *> Arguments;
- for (const auto &[Idx, Op] : enumerate(operands())) {
+ for (const auto &[Idx, Op] : enumerate(arg_operands())) {
auto *V = Op->getUnderlyingValue();
if (!V) {
if (auto *UI = dyn_cast_or_null<CallBase>(getUnderlyingValue())) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index faec08cac18751..3d0c475a06b457 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1381,6 +1381,17 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return nullptr;
return new VPWidenEVLRecipe(*W, EVL);
})
+ .Case<VPWidenIntrinsicRecipe>(
+ [&](VPWidenIntrinsicRecipe *CInst) -> VPRecipeBase * {
+ auto *CI = cast<CallInst>(CInst->getUnderlyingInstr());
+ SmallVector<VPValue *> Ops(CInst->operands());
+ Ops.push_back(&EVL);
+ Intrinsic::ID VPID = getVPIntrinsicIDForCall(CI);
+ if (VPID == Intrinsic::not_intrinsic)
+ return nullptr;
+ return new VPWidenIntrinsicRecipe(
+ *CI, VPID, Ops, CI->getType(), CI->getDebugLoc());
+ })
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
return new VPReductionEVLRecipe(*Red, EVL, NewMask);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
index 4970f6ac34928b..f4379b2d53348c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-call-intrinsics.ll
@@ -27,7 +27,7 @@ define void @vp_smax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.smax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMAX:%.+]]> = call llvm.vp.smax(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMAX]]>, vp<[[EVL]]>
@@ -39,20 +39,20 @@ define void @vp_smax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }
entry:
- br label %for.body
-
-for.body:
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx3, align 4
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %gep, align 4
+ %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %gep3, align 4
%. = tail call i32 @llvm.smax.i32(i32 %0, i32 %1)
- %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
- store i32 %., ptr %arrayidx11, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
+ %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %., ptr %gep11, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %loop
exit:
ret void
@@ -80,7 +80,7 @@ define void @vp_smin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.smin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[SMIN:%.+]]> = call llvm.vp.smin(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[SMIN]]>, vp<[[EVL]]>
@@ -92,20 +92,20 @@ define void @vp_smin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }
entry:
- br label %for.body
-
-for.body:
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx3, align 4
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %gep, align 4
+ %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %gep3, align 4
%. = tail call i32 @llvm.smin.i32(i32 %0, i32 %1)
- %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
- store i32 %., ptr %arrayidx11, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
+ %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %., ptr %gep11, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %loop
exit:
ret void
@@ -133,7 +133,7 @@ define void @vp_umax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.umax(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMAX:%.+]]> = call llvm.vp.umax(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMAX]]>, vp<[[EVL]]>
@@ -145,20 +145,20 @@ define void @vp_umax(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }
entry:
- br label %for.body
-
-for.body:
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx3, align 4
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %gep, align 4
+ %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %gep3, align 4
%. = tail call i32 @llvm.umax.i32(i32 %0, i32 %1)
- %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
- store i32 %., ptr %arrayidx11, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
+ %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %., ptr %gep11, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %loop
exit:
ret void
@@ -186,7 +186,7 @@ define void @vp_umin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
-; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.umin(ir<[[LD1]]>, ir<[[LD2]]>)
+; IF-EVL-NEXT: WIDEN-INTRINSIC ir<[[UMIN:%.+]]> = call llvm.vp.umin(ir<[[LD1]]>, ir<[[LD2]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]>
; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[UMIN]]>, vp<[[EVL]]>
@@ -198,20 +198,20 @@ define void @vp_umin(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
; IF-EVL-NEXT: }
entry:
- br label %for.body
-
-for.body:
- %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
- %0 = load i32, ptr %arrayidx, align 4
- %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv
- %1 = load i32, ptr %arrayidx3, align 4
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep = getelementptr inbounds i32, ptr %b, i64 %iv
+ %0 = load i32, ptr %gep, align 4
+ %gep3 = getelementptr inbounds i32, ptr %c, i64 %iv
+ %1 = load i32, ptr %gep3, align 4
%. = tail call i32 @llvm.umin.i32(i32 %0, i32 %1)
- %arrayidx11 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
- store i32 %., ptr %arrayidx11, align 4
- %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
- %exitcond.not = icmp eq i64 %indvars.iv.next, %N
- br i1 %exitcond.not, label %exit, label %for.body
+ %gep11 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %., ptr %gep11, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %exit, label %loop
exit:
ret void