[llvm] [RISCV] Allow non-loop invariant steps in RISCVGatherScatterLowering (PR #122244)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 9 02:28:17 PST 2025
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/122244
The motivation for this is to allow us to match strided accesses that are emitted from the loop vectorizer with EVL tail folding (see #122232)
In these loops the step isn't loop invariant and is based off of @llvm.experimental.get.vector.length.
We can relax this as long as we make sure to construct the updates after the definition inside the loop, instead of the preheader.
I presume the restriction was previously added so that the step would dominate the insertion point in the preheader. I can't think of why it wouldn't be safe to calculate it in the loop otherwise.
>From 25c09785636175d8ef366c079c42b1758e78ec15 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 9 Jan 2025 18:17:28 +0800
Subject: [PATCH 1/2] Precommit test
---
.../CodeGen/RISCV/rvv/strided-load-store.ll | 58 +++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index b1ece9fa8272db..d14b189d6958b2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -398,3 +398,61 @@ define <vscale x 1 x i64> @vector_base_vector_offset(ptr %p, <vscale x 1 x i64>
declare i64 @llvm.vscale.i64()
declare void @llvm.masked.scatter.nxv1i64.nxv1p0(<vscale x 1 x i64>, <vscale x 1 x ptr>, i32, <vscale x 1 x i1>)
declare <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr>, i32, <vscale x 1 x i1>, <vscale x 1 x i64>)
+
+define <vscale x 1 x i64> @gather_loop_variant_step(ptr %a, i32 %len) {
+; CHECK-LABEL: @gather_loop_variant_step(
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
+; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
+; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
+; CHECK-NEXT: [[OFFSET:%.*]] = shl <vscale x 1 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], <vscale x 1 x i64> [[OFFSET]], i32 3
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP1]], i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
+; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[EVL_ZEXT]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret <vscale x 1 x i64> [[ACCUM_NEXT]]
+;
+vector.ph:
+ %wide.trip.count = zext i32 %len to i64
+ %1 = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.ind = phi <vscale x 1 x i64> [ %1, %vector.ph ], [ %vec.ind.next, %vector.body ]
+ %accum = phi <vscale x 1 x i64> [ zeroinitializer, %vector.ph ], [ %accum.next, %vector.body ]
+
+ %elems = sub i64 %wide.trip.count, %index
+ %evl = call i32 @llvm.experimental.get.vector.length(i64 %elems, i32 1, i1 true)
+ %evl.zext = zext i32 %evl to i64
+
+ %offset = shl <vscale x 1 x i64> %vec.ind, splat (i64 4)
+ %2 = getelementptr inbounds %struct.foo, ptr %a, <vscale x 1 x i64> %offset, i32 3
+ %gather = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> %2, i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
+ %accum.next = add <vscale x 1 x i64> %accum, %gather
+
+ %index.next = add nuw i64 %index, %evl.zext
+
+ %.splatinsert = insertelement <vscale x 1 x i64> poison, i64 %evl.zext, i64 0
+ %.splat = shufflevector <vscale x 1 x i64> %.splatinsert, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %vec.ind.next = add <vscale x 1 x i64> %vec.ind, %.splat
+
+ %3 = icmp ne i64 %index.next, %wide.trip.count
+ br i1 %3, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+ ret <vscale x 1 x i64> %accum.next
+}
>From ed3f4880e4b4485185248edd6fafbdfe248f7f40 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Thu, 9 Jan 2025 18:19:40 +0800
Subject: [PATCH 2/2] [RISCV] Allow non-loop invariant steps in
RISCVGatherScatterLowering
The motivation for this is to allow us to match strided accesses that are emitted from the loop vectorizer with EVL tail folding (see #122232)
In these loops the step isn't loop invariant and is based off of @llvm.experimental.get.vector.length.
We can relax this as long as we make sure to construct the updates after the definition inside the loop, instead of the preheader.
I presume the restriction was previously added so that the step would dominate the insertion point in the preheader. I can't think of why it wouldn't be safe to calculate it in the loop otherwise.
---
.../RISCV/RISCVGatherScatterLowering.cpp | 21 +++++++++++++------
.../rvv/fixed-vectors-strided-load-store.ll | 4 ++--
.../CodeGen/RISCV/rvv/strided-load-store.ll | 15 +++++++------
3 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index f1e974f973cbe7..872dc1556999a0 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -212,10 +212,6 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
assert(Phi->getIncomingValue(IncrementingBlock) == Inc &&
"Expected one operand of phi to be Inc");
- // Only proceed if the step is loop invariant.
- if (!L->isLoopInvariant(Step))
- return false;
-
// Step should be a splat.
Step = getSplatValue(Step);
if (!Step)
@@ -311,18 +307,31 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
}
case Instruction::Mul: {
Start = Builder.CreateMul(Start, SplatOp, "start");
- Step = Builder.CreateMul(Step, SplatOp, "step");
Stride = Builder.CreateMul(Stride, SplatOp, "stride");
break;
}
case Instruction::Shl: {
Start = Builder.CreateShl(Start, SplatOp, "start");
- Step = Builder.CreateShl(Step, SplatOp, "step");
Stride = Builder.CreateShl(Stride, SplatOp, "stride");
break;
}
}
+ // Adjust the step value after its definition if it's an instruction.
+ if (auto *StepI = dyn_cast<Instruction>(Step))
+ Builder.SetInsertPoint(*StepI->getInsertionPointAfterDef());
+
+ switch (BO->getOpcode()) {
+ default:
+ break;
+ case Instruction::Mul:
+ Step = Builder.CreateMul(Step, SplatOp, "step");
+ break;
+ case Instruction::Shl:
+ Step = Builder.CreateShl(Step, SplatOp, "step");
+ break;
+ }
+
Inc->setOperand(StepIndex, Step);
BasePtr->setIncomingValue(StartBlock, Start);
return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
index 2cbbfc019ab4df..f26dfda3759898 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store.ll
@@ -320,8 +320,8 @@ for.cond.cleanup: ; preds = %vector.body
define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
; CHECK-LABEL: @gather_unknown_pow2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT:%.*]]
-; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT]]
+; CHECK-NEXT: [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]]
+; CHECK-NEXT: [[STEP:%.*]] = shl i64 8, [[SHIFT]]
; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[STRIDE]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
index d14b189d6958b2..87c8aa392062a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-load-store.ll
@@ -403,23 +403,22 @@ define <vscale x 1 x i64> @gather_loop_variant_step(ptr %a, i32 %len) {
; CHECK-LABEL: @gather_loop_variant_step(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN:%.*]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACCUM:%.*]] = phi <vscale x 1 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ELEMS:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[VEC_IND_SCALAR]]
; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 1, i1 true)
; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64
-; CHECK-NEXT: [[OFFSET:%.*]] = shl <vscale x 1 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[A:%.*]], <vscale x 1 x i64> [[OFFSET]], i32 3
-; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.masked.gather.nxv1i64.nxv1p0(<vscale x 1 x ptr> [[TMP1]], i32 8, <vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> undef)
+; CHECK-NEXT: [[STEP:%.*]] = shl i64 [[EVL_ZEXT]], 4
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.vp.strided.load.nxv1i64.p0.i64(ptr [[TMP0]], i64 256, <vscale x 1 x i1> splat (i1 true), i32 [[TMP1]])
+; CHECK-NEXT: [[GATHER:%.*]] = call <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1> splat (i1 true), <vscale x 1 x i64> [[TMP2]], <vscale x 1 x i64> undef, i32 [[TMP1]])
; CHECK-NEXT: [[ACCUM_NEXT]] = add <vscale x 1 x i64> [[ACCUM]], [[GATHER]]
; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], [[EVL_ZEXT]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[EVL_ZEXT]], i64 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], [[STEP]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[VEC_IND_NEXT_SCALAR]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
More information about the llvm-commits
mailing list