[llvm] [LV][EVL] Generate negative strided load/store for reversed load/store (PR #123608)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 20 05:01:11 PST 2025
https://github.com/wangpc-pp created https://github.com/llvm/llvm-project/pull/123608
This can reduce the operations to reverse mask, load result
and store value.
>From 3a6a22634276601b38d9d3e766551eb37736fbe4 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Mon, 20 Jan 2025 20:56:18 +0800
Subject: [PATCH] [LV][EVL] Generate negative strided load/store for reversed
load/store
This can reduce the operations to reverse mask, load result
and store value.
---
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 66 +++++++++++--------
...-force-tail-with-evl-reverse-load-store.ll | 23 ++-----
...orize-force-tail-with-evl-uniform-store.ll | 3 +-
3 files changed, 46 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa5f92b235555e..587c7e9b4417fa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
- Value *EVL, const Twine &Name) {
- VectorType *ValTy = cast<VectorType>(Operand->getType());
- Value *AllTrueMask =
- Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
- return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
- {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
auto *LI = cast<LoadInst>(&Ingredient);
@@ -2630,8 +2619,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
@@ -2641,17 +2628,29 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Load, DataTy, Addr, "vp.op.load"));
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ Value *Operands[] = {
+ Addr,
+ ConstantInt::getSigned(
+ Builder.getInt32Ty(),
+ -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+ Mask, EVL};
+ NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+ {DataTy, PtrTy, Builder.getInt32Ty()},
+ Operands, nullptr, "vp.neg.strided.load");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
+ }
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
State.addMetadata(NewLI, LI);
Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}
@@ -2749,13 +2748,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
@@ -2765,11 +2760,26 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isReverse()) {
+ Type *StoredValTy = StoredVal->getType();
+ auto *EltTy = cast<VectorType>(StoredValTy)->getElementType();
+ auto *PtrTy = Addr->getType();
+ Value *Operands[] = {
+ StoredVal, Addr,
+ ConstantInt::getSigned(
+ Builder.getInt32Ty(),
+ -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+ Mask, EVL};
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands);
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5b579b0749c677..ba65137e94935c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -39,16 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -153,18 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -280,8 +274,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
@@ -290,16 +283,14 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index a2f85b9ed4ffe1..69ba0bad45de6a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -43,8 +43,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP20]], i32 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
More information about the llvm-commits
mailing list