[llvm] [VPlan] Don't use the legacy cost model for loop conditions (PR #156864)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 05:54:13 PDT 2025
================
@@ -427,71 +427,53 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
; PRED-NEXT: br i1 [[TMP13]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; PRED: [[VECTOR_PH]]:
-; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4
-; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4
+; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 2
+; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 2
; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0
-; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP0]])
-; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0
-; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[MUL]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
; PRED: [[VECTOR_BODY]]:
-; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
-; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
-; PRED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE7]] ]
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE3]] ]
+; PRED-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE3]] ]
; PRED-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; PRED-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; PRED-NEXT: [[TMP18:%.*]] = zext <4 x i32> [[TMP17]] to <4 x i64>
-; PRED-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; PRED-NEXT: [[TMP17:%.*]] = mul <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; PRED-NEXT: [[TMP18:%.*]] = zext <2 x i32> [[TMP17]] to <2 x i64>
+; PRED-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
; PRED-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; PRED: [[PRED_STORE_IF]]:
-; PRED-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; PRED-NEXT: [[TMP20:%.*]] = extractelement <2 x i64> [[TMP18]], i32 0
; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP20]]
; PRED-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 0
; PRED-NEXT: store i32 [[TMP22]], ptr [[TMP21]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE]]
; PRED: [[PRED_STORE_CONTINUE]]:
-; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
-; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]]
+; PRED-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; PRED-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
; PRED: [[PRED_STORE_IF2]]:
-; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; PRED-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP18]], i32 1
; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE3]]
; PRED: [[PRED_STORE_CONTINUE3]]:
-; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
-; PRED-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
-; PRED: [[PRED_STORE_IF4]]:
-; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
-; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
-; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
-; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE5]]
-; PRED: [[PRED_STORE_CONTINUE5]]:
-; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
-; PRED-NEXT: br i1 [[TMP31]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
-; PRED: [[PRED_STORE_IF6]]:
-; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
-; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
-; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
-; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
-; PRED-NEXT: br label %[[PRED_STORE_CONTINUE7]]
-; PRED: [[PRED_STORE_CONTINUE7]]:
-; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
-; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP16]])
+; PRED-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; PRED-NEXT: [[TMP36:%.*]] = xor i1 [[TMP35]], true
-; PRED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; PRED-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], splat (i32 2)
----------------
david-arm wrote:
Do you know why we're choosing a lower VF here? It may be a sensible change, but it's not obvious what triggered the different choice of VF as it doesn't seem like a FCmp, ICmp or BranchOnCount VPInstruction being generated here.
https://github.com/llvm/llvm-project/pull/156864
More information about the llvm-commits
mailing list