[llvm] 1c7ec06 - [VPlan] Optimize LastActiveLane to EVL - 1 (#169766)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 27 01:03:14 PST 2025
Author: Luke Lau
Date: 2025-11-27T17:03:08+08:00
New Revision: 1c7ec06b16dc59b5b52cff95bde7d5330ffa0293
URL: https://github.com/llvm/llvm-project/commit/1c7ec06b16dc59b5b52cff95bde7d5330ffa0293
DIFF: https://github.com/llvm/llvm-project/commit/1c7ec06b16dc59b5b52cff95bde7d5330ffa0293.diff
LOG: [VPlan] Optimize LastActiveLane to EVL - 1 (#169766)
With EVL tail folding, the LastActiveLane can be computed with EVL - 1.
This removes the need for a header mask and vfirst.m for loops with live
outs on RISC-V:
# %bb.5: # %for.cond.cleanup7
- vsetvli zero, zero, e32, m2, ta, ma
- vmv.v.x v8, s1
- vmsleu.vv v10, v8, v22
- vfirst.m a0, v10
- srli a1, a0, 63
- czero.nez a0, a0, a1
- czero.eqz a1, s8, a1
- or a0, a0, a1
- addi a0, a0, -1
- vsetvli zero, zero, e64, m4, ta, ma
- vslidedown.vx v8, v12, a0
+ addi s1, s1, -1
+ vslidedown.vx v8, v12, s1
Added:
Modified:
llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ef82eeba8b9a5..4b7f90118374b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2736,6 +2736,7 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo, VPValue &EVL) {
VPlan *Plan = CurRecipe.getParent()->getPlan();
+ DebugLoc DL = CurRecipe.getDebugLoc();
VPValue *Addr, *Mask, *EndPtr;
/// Adjust any end pointers so that they point to the end of EVL lanes not VF.
@@ -2787,13 +2788,21 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
return new VPWidenIntrinsicRecipe(
Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
+ TypeInfo.inferScalarType(LHS), {}, {}, DL);
if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
m_VPValue(RHS))))
return new VPWidenIntrinsicRecipe(
Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
- TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc());
+ TypeInfo.inferScalarType(LHS), {}, {}, DL);
+
+ if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
+ Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
+ VPValue *ZExt =
+ VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
+ return new VPInstruction(Instruction::Sub,
+ {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
+ }
return nullptr;
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 41d3af2d5e763..9daf4236982bd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -87,12 +87,9 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 252, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP2]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 4, [[TMP2]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge <vscale x 16 x i32> [[TMP0]], [[BROADCAST_SPLAT3]]
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 16 x i32> [[VEC_IND]] to <vscale x 16 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC]], <vscale x 16 x i64> [[TMP9]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP6]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP2]]), !alias.scope [[META3:![0-9]+]]
@@ -103,7 +100,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP5]], i1 false)
+; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], 1
; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 16
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index feaf26e366962..e4ba6fe9d757d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -656,13 +656,9 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 12, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 8, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT5]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; CHECK-NEXT: [[TMP15:%.*]] = icmp uge <vscale x 8 x i32> [[TMP5]], [[BROADCAST_SPLAT8]]
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1> [[TMP0]], <vscale x 8 x i8> [[BROADCAST_SPLAT2]], <vscale x 8 x i8> splat (i8 1), i32 [[TMP3]])
; CHECK-NEXT: [[TMP9:%.*]] = udiv <vscale x 8 x i8> [[VEC_IND]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 8 x i8> [[TMP9]] to <vscale x 8 x i16>
@@ -675,7 +671,7 @@ define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP15]], i1 false)
+; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP3]] to i64
; CHECK-NEXT: [[TMP17:%.*]] = sub i64 [[TMP16]], 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 8
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 97c9fd75a42bb..e35db479dc963 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -24,10 +24,6 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 23, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP6]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP8:%.*]] = icmp uge <vscale x 2 x i32> [[TMP7]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[EVL_BASED_IV]]
; CHECK-NEXT: [[VP_OP_LOAD]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP6]])
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> [[VECTOR_RECUR]], <vscale x 2 x i64> [[VP_OP_LOAD]], i32 -1, <vscale x 2 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP6]])
@@ -39,8 +35,7 @@ define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP8]], i1 false)
-; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP12]], 1
; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[TMP17]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 361ff00674758..549222cd919da 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -231,10 +231,6 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1025, [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -246,8 +242,7 @@ define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
-; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP5]], 1
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
index d6936a4871ed5..b95691f6e7c04 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll
@@ -417,10 +417,6 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[TC]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP4]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[TMP9]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP9]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-NEXT: [[TMP22:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
-; IF-EVL-NEXT: [[TMP23:%.*]] = icmp uge <vscale x 4 x i32> [[TMP22]], [[BROADCAST_SPLAT]]
; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]]
; IF-EVL-NEXT: [[WIDE_LOAD]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1, <vscale x 4 x i1> splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP9]])
@@ -433,8 +429,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) {
; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP23]], i1 false)
-; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP27]], 1
+; IF-EVL-NEXT: [[TMP28:%.*]] = sub i64 [[TMP13]], 1
; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[TMP28]], 1
; IF-EVL-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 9a361e0b48a61..55e7018c49eec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -117,10 +117,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
-; SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; SCALABLE-NEXT: [[TMP6:%.*]] = load i64, ptr [[B]], align 8
; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i64 0
; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -132,8 +128,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; SCALABLE-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; SCALABLE: [[MIDDLE_BLOCK]]:
-; SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
-; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; SCALABLE-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP7]], 2
; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP11]], 0
@@ -186,10 +181,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; TF-SCALABLE-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP0]], i64 0
-; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
-; TF-SCALABLE-NEXT: [[TMP2:%.*]] = icmp uge <vscale x 2 x i32> [[TMP1]], [[BROADCAST_SPLAT1]]
; TF-SCALABLE-NEXT: [[V:%.*]] = load i64, ptr [[B]], align 8
; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
@@ -201,8 +192,7 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
; TF-SCALABLE-NEXT: [[TMP6:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; TF-SCALABLE-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: [[FIRST_INACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> [[TMP2]], i1 false)
-; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[FIRST_INACTIVE_LANE]], 1
+; TF-SCALABLE-NEXT: [[LAST_ACTIVE_LANE:%.*]] = sub i64 [[TMP5]], 1
; TF-SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 0
More information about the llvm-commits
mailing list