[llvm] [LV][EVL] Support reversed loads/stores. (PR #88025)

Alexey Bataev via llvm-commits llvm-commits@lists.llvm.org
Wed May 1 04:11:39 PDT 2024


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/88025

From ae89a26717642c5f46e378879c70d1c7d3b1dfcf Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Mon, 8 Apr 2024 17:56:25 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 67 +++++++++++++------
 ...-force-tail-with-evl-reverse-load-store.ll | 23 +++----
 2 files changed, 55 insertions(+), 35 deletions(-)
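
The first patch drops the FIXME guard that disabled EVL tail folding whenever a
reverse widened memory access was present, and instead lowers reversed accesses
directly: the data (and, where a mask is required, the mask) is permuted with
@llvm.experimental.vp.reverse around a plain vp.load/vp.store of the reversed
address window. A minimal sketch of the load-side pattern in LLVM IR (the %addr,
%allones, and %evl names are illustrative, not taken from the patch):

  ; vp.load reads the contiguous reversed window; vp.reverse then restores
  ; the original element order. %allones is a splat-true mask.
  %v = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %addr, <vscale x 4 x i1> %allones, i32 %evl)
  %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i1> %allones, i32 %evl)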

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9e22dce384773e..797a3fd1e9dba4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1579,13 +1579,7 @@ class LoopVectorizationCostModel {
   /// Returns true if VP intrinsics with explicit vector length support should
   /// be generated in the tail folded loop.
   bool foldTailWithEVL() const {
-    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
-           // FIXME: remove this once vp_reverse is supported.
-           none_of(
-               WideningDecisions,
-               [](const std::pair<std::pair<Instruction *, ElementCount>,
-                                  std::pair<InstWidening, InstructionCost>>
-                      &Data) { return Data.second.first == CM_Widen_Reverse; });
+    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
   }
 
   /// Returns true if the Phi is part of an inloop reduction.
@@ -9361,10 +9355,17 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
 
 /// Creates either vp_store or vp_scatter intrinsics calls to represent
 /// predicated store/scatter.
-static Instruction *
-lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
-                                Value *StoredVal, bool IsScatter, Value *Mask,
-                                Value *EVL, const Align &Alignment) {
+static Instruction *lowerStoreUsingVectorIntrinsics(
+    IRBuilderBase &Builder, Value *Addr, Value *StoredVal, bool IsScatter,
+    bool IsReverse, Value *Mask, Value *EVL, const Align &Alignment) {
+  if (IsReverse) {
+    auto *StoredValTy = cast<VectorType>(StoredVal->getType());
+    Value *BlockInMaskPart =
+        Builder.getAllOnesMask(StoredValTy->getElementCount());
+    StoredVal = Builder.CreateIntrinsic(
+        StoredValTy, Intrinsic::experimental_vp_reverse,
+        {StoredVal, BlockInMaskPart, EVL}, nullptr, "vp.reverse");
+  }
   CallInst *Call;
   if (IsScatter) {
     Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
@@ -9384,11 +9385,9 @@ lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr,
 
 /// Creates either vp_load or vp_gather intrinsics calls to represent
 /// predicated load/gather.
-static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
-                                                   VectorType *DataTy,
-                                                   Value *Addr, bool IsGather,
-                                                   Value *Mask, Value *EVL,
-                                                   const Align &Alignment) {
+static Instruction *lowerLoadUsingVectorIntrinsics(
+    IRBuilderBase &Builder, VectorType *DataTy, Value *Addr, bool IsGather,
+    bool IsReverse, Value *Mask, Value *EVL, const Align &Alignment) {
   CallInst *Call;
   if (IsGather) {
     Call =
@@ -9402,7 +9401,14 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder,
   }
   Call->addParamAttr(
       0, Attribute::getWithAlignment(Call->getContext(), Alignment));
-  return Call;
+  Instruction *Res = Call;
+  if (IsReverse) {
+    Value *BlockInMaskPart = Builder.getAllOnesMask(DataTy->getElementCount());
+    Res = Builder.CreateIntrinsic(DataTy, Intrinsic::experimental_vp_reverse,
+                                  {Res, BlockInMaskPart, EVL}, nullptr,
+                                  "vp.reverse");
+  }
+  return Res;
 }
 
 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
@@ -9430,7 +9436,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     // a null all-one mask is a null mask.
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Value *Mask = State.get(getMask(), Part);
-      if (isReverse())
+      if (isReverse() && !State.EVL)
         Mask = Builder.CreateVectorReverse(Mask, "reverse");
       BlockInMaskParts[Part] = Mask;
     }
@@ -9456,11 +9462,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         // is created only if TTI prefers predicated vectorization, thus if EVL
         // is not nullptr it also implies preference for predicated
         // vectorization.
-        // FIXME: Support reverse store after vp_reverse is added.
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        if (isMaskRequired && isReverse() && !getMask()->isLiveIn()) {
+          VectorType *MaskTy = cast<VectorType>(MaskPart->getType());
+          Value *BlockInMaskPart =
+              Builder.getAllOnesMask(MaskTy->getElementCount());
+          MaskPart = Builder.CreateIntrinsic(
+              MaskTy, Intrinsic::experimental_vp_reverse,
+              {MaskPart, BlockInMaskPart, EVL}, nullptr, "vp.reverse.mask");
+          BlockInMaskParts[Part] = MaskPart;
+        }
         NewSI = lowerStoreUsingVectorIntrinsics(
             Builder, State.get(getAddr(), Part, !CreateGatherScatter),
-            StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment);
+            StoredVal, CreateGatherScatter, isReverse(), MaskPart, EVL,
+            Alignment);
       } else if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
@@ -9504,11 +9519,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
       // is created only if TTI prefers predicated vectorization, thus if EVL
       // is not nullptr it also implies preference for predicated
       // vectorization.
-      // FIXME: Support reverse loading after vp_reverse is added.
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      if (isMaskRequired && isReverse() && !getMask()->isLiveIn()) {
+        VectorType *MaskTy = cast<VectorType>(MaskPart->getType());
+        Value *BlockInMaskPart =
+            Builder.getAllOnesMask(MaskTy->getElementCount());
+        MaskPart = Builder.CreateIntrinsic(
+            MaskTy, Intrinsic::experimental_vp_reverse,
+            {MaskPart, BlockInMaskPart, EVL}, nullptr, "vp.reverse.mask");
+        BlockInMaskParts[Part] = MaskPart;
+      }
       NewLI = lowerLoadUsingVectorIntrinsics(
           Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter),
-          CreateGatherScatter, MaskPart, EVL, Alignment);
+          CreateGatherScatter, isReverse(), MaskPart, EVL, Alignment);
     } else if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index f2222e0a1f936a..f839eafe9b2a61 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -30,14 +30,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[INDEX]]
+; IF-EVL-NEXT:    [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[TMP9:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
+; IF-EVL-NEXT:    [[TMP8:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true)
+; IF-EVL-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; IF-EVL-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
-; IF-EVL-NEXT:    [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]]
-; IF-EVL-NEXT:    [[TMP10:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = add i64 [[TMP7]], -1
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
@@ -46,9 +43,8 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = sub i64 1, [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP15]]
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP16]]
-; IF-EVL-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP18]], i32 4, <vscale x 4 x i1> [[REVERSE]], <vscale x 4 x i32> poison)
-; IF-EVL-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]])
+; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP31:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
@@ -56,9 +52,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP21]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP10]])
-; IF-EVL-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> [[REVERSE3]])
-; IF-EVL-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[REVERSE5]], ptr [[TMP25]], i32 4, <vscale x 4 x i1> [[REVERSE4]])
+; IF-EVL-NEXT:    [[TMP28:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP31]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP28]], ptr align 4 [[TMP25]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    [[TMP29:%.*]] = zext i32 [[TMP8]] to i64
+; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP29]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT:    br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
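
On the store side the same shape appears mirrored: the stored value is reversed
first and then written with vp.store through the reversed address window, as the
updated checks above show. A sketch under the same illustrative names:

  ; reverse the element order of the stored value, then store it with the
  ; (here all-true) mask and the active vector length %evl.
  %rev.val = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %val, <vscale x 4 x i1> %allones, i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %rev.val, ptr align 4 %addr, <vscale x 4 x i1> %allones, i32 %evl)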

From 674a61570a383a361cb5740c802b2ff516942bb9 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Wed, 24 Apr 2024 18:21:16 +0000
Subject: [PATCH 2/2] Address comments

Created using spr 1.3.5
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        | 10 ++++++++++
 ...vectorize-force-tail-with-evl-reverse-load-store.ll |  3 ++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4ea301d15a7711..315e644ed4a814 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9475,6 +9475,16 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   Value *Mask = getMask()
                     ? State.get(getMask(), 0)
                     : Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  if (isReverse() && getMask()) {
+    VectorType *MaskTy = cast<VectorType>(Mask->getType());
+    Mask = Builder.CreateIntrinsic(
+        MaskTy, Intrinsic::experimental_vp_reverse,
+        {Mask,
+         Builder.CreateVectorSplat(MaskTy->getElementCount(),
+                                   Builder.getTrue()),
+         EVL},
+        nullptr, "vp.reverse.mask");
+  }
   Value *Addr = State.get(getAddr(), 0, !CreateScatter);
   if (CreateScatter) {
     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5f6310e8fda73a..c9b57361c0b983 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -171,7 +171,8 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP31:%.*]] = getelementptr i32, ptr [[TMP26]], i64 [[TMP29]]
 ; IF-EVL-NEXT:    [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i64 [[TMP30]]
 ; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP32]], <vscale x 4 x i1> [[TMP18]], i32 [[TMP8]])
+; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP18]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP8]])
+; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP32]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP8]])
 ; IF-EVL-NEXT:    [[TMP33:%.*]] = zext i32 [[TMP8]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP33]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
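
The second patch addresses the review comments: when VPWidenStoreEVLRecipe
carries a mask, that mask must itself be reversed before it reaches the
vp.store, which the added check line verifies through
@llvm.experimental.vp.reverse.nxv4i1. A sketch of the masked case, with the
same illustrative names:

  ; reverse the user mask with an all-true auxiliary mask, so that mask lane i
  ; guards the element that lands in lane i after the data reverse.
  %rev.mask = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %allones, i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %rev.val, ptr align 4 %addr, <vscale x 4 x i1> %rev.mask, i32 %evl)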


