[llvm] [LoopVectorizer][AArch64] Move getMinTripCountTailFoldingThreshold later. (PR #132170)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 20 07:43:47 PDT 2025
================
@@ -495,6 +478,235 @@ while.end:
ret void
}
+; This has a trip-count of 4, and should vectorize with vf==4.
+define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
+; CHECK-LABEL: define i32 @tc4(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 64
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
+; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
+; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 0, 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = sub <4 x i32> [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
+; CHECK-NEXT: [[TMP16:%.*]] = sub <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD4]]
+; CHECK-NEXT: [[TMP22:%.*]] = sub <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP26]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP28:%.*]] = sub <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <4 x i32> [[TMP27]], [[TMP21]]
+; CHECK-NEXT: [[TMP30:%.*]] = sub <4 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT: [[TMP31:%.*]] = add <4 x i32> [[TMP28]], [[TMP22]]
+; CHECK-NEXT: [[TMP32:%.*]] = sub <4 x i32> [[TMP22]], [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = add <4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-NEXT: [[TMP34:%.*]] = lshr <4 x i32> [[TMP33]], splat (i32 15)
+; CHECK-NEXT: [[TMP35:%.*]] = and <4 x i32> [[TMP34]], splat (i32 65537)
+; CHECK-NEXT: [[TMP36:%.*]] = mul nuw <4 x i32> [[TMP35]], splat (i32 65535)
+; CHECK-NEXT: [[TMP37:%.*]] = add <4 x i32> [[TMP36]], [[TMP33]]
+; CHECK-NEXT: [[TMP38:%.*]] = xor <4 x i32> [[TMP37]], [[TMP36]]
+; CHECK-NEXT: [[TMP39:%.*]] = sub <4 x i32> [[TMP13]], [[TMP29]]
+; CHECK-NEXT: [[TMP40:%.*]] = lshr <4 x i32> [[TMP39]], splat (i32 15)
+; CHECK-NEXT: [[TMP41:%.*]] = and <4 x i32> [[TMP40]], splat (i32 65537)
+; CHECK-NEXT: [[TMP42:%.*]] = mul nuw <4 x i32> [[TMP41]], splat (i32 65535)
+; CHECK-NEXT: [[TMP43:%.*]] = add <4 x i32> [[TMP42]], [[TMP39]]
+; CHECK-NEXT: [[TMP44:%.*]] = xor <4 x i32> [[TMP43]], [[TMP42]]
+; CHECK-NEXT: [[TMP45:%.*]] = add <4 x i32> [[TMP31]], [[TMP15]]
+; CHECK-NEXT: [[TMP46:%.*]] = lshr <4 x i32> [[TMP45]], splat (i32 15)
+; CHECK-NEXT: [[TMP47:%.*]] = and <4 x i32> [[TMP46]], splat (i32 65537)
+; CHECK-NEXT: [[TMP48:%.*]] = mul nuw <4 x i32> [[TMP47]], splat (i32 65535)
+; CHECK-NEXT: [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP45]]
+; CHECK-NEXT: [[TMP50:%.*]] = xor <4 x i32> [[TMP49]], [[TMP48]]
+; CHECK-NEXT: [[TMP51:%.*]] = sub <4 x i32> [[TMP15]], [[TMP31]]
+; CHECK-NEXT: [[TMP52:%.*]] = lshr <4 x i32> [[TMP51]], splat (i32 15)
+; CHECK-NEXT: [[TMP53:%.*]] = and <4 x i32> [[TMP52]], splat (i32 65537)
+; CHECK-NEXT: [[TMP54:%.*]] = mul nuw <4 x i32> [[TMP53]], splat (i32 65535)
+; CHECK-NEXT: [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP51]]
+; CHECK-NEXT: [[TMP56:%.*]] = xor <4 x i32> [[TMP55]], [[TMP54]]
+; CHECK-NEXT: [[TMP57:%.*]] = add <4 x i32> [[TMP30]], [[TMP14]]
+; CHECK-NEXT: [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], splat (i32 15)
+; CHECK-NEXT: [[TMP59:%.*]] = and <4 x i32> [[TMP58]], splat (i32 65537)
+; CHECK-NEXT: [[TMP60:%.*]] = mul nuw <4 x i32> [[TMP59]], splat (i32 65535)
+; CHECK-NEXT: [[TMP61:%.*]] = add <4 x i32> [[TMP60]], [[TMP57]]
+; CHECK-NEXT: [[TMP62:%.*]] = xor <4 x i32> [[TMP61]], [[TMP60]]
+; CHECK-NEXT: [[TMP63:%.*]] = sub <4 x i32> [[TMP14]], [[TMP30]]
+; CHECK-NEXT: [[TMP64:%.*]] = lshr <4 x i32> [[TMP63]], splat (i32 15)
+; CHECK-NEXT: [[TMP65:%.*]] = and <4 x i32> [[TMP64]], splat (i32 65537)
+; CHECK-NEXT: [[TMP66:%.*]] = mul nuw <4 x i32> [[TMP65]], splat (i32 65535)
+; CHECK-NEXT: [[TMP67:%.*]] = add <4 x i32> [[TMP66]], [[TMP63]]
+; CHECK-NEXT: [[TMP68:%.*]] = xor <4 x i32> [[TMP67]], [[TMP66]]
+; CHECK-NEXT: [[TMP69:%.*]] = add <4 x i32> [[TMP32]], [[TMP16]]
+; CHECK-NEXT: [[TMP70:%.*]] = lshr <4 x i32> [[TMP69]], splat (i32 15)
+; CHECK-NEXT: [[TMP71:%.*]] = and <4 x i32> [[TMP70]], splat (i32 65537)
+; CHECK-NEXT: [[TMP72:%.*]] = mul nuw <4 x i32> [[TMP71]], splat (i32 65535)
+; CHECK-NEXT: [[TMP73:%.*]] = add <4 x i32> [[TMP72]], [[TMP69]]
+; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i32> [[TMP73]], [[TMP72]]
+; CHECK-NEXT: [[TMP75:%.*]] = sub <4 x i32> [[TMP16]], [[TMP32]]
+; CHECK-NEXT: [[TMP76:%.*]] = lshr <4 x i32> [[TMP75]], splat (i32 15)
+; CHECK-NEXT: [[TMP77:%.*]] = and <4 x i32> [[TMP76]], splat (i32 65537)
+; CHECK-NEXT: [[TMP78:%.*]] = mul nuw <4 x i32> [[TMP77]], splat (i32 65535)
+; CHECK-NEXT: [[TMP79:%.*]] = add <4 x i32> [[TMP78]], [[TMP75]]
+; CHECK-NEXT: [[TMP80:%.*]] = xor <4 x i32> [[TMP79]], [[TMP78]]
+; CHECK-NEXT: [[TMP81:%.*]] = add <4 x i32> [[TMP74]], [[TMP80]]
+; CHECK-NEXT: [[TMP82:%.*]] = add <4 x i32> [[TMP81]], [[TMP68]]
+; CHECK-NEXT: [[TMP83:%.*]] = add <4 x i32> [[TMP82]], [[TMP62]]
+; CHECK-NEXT: [[TMP84:%.*]] = add <4 x i32> [[TMP83]], [[TMP44]]
+; CHECK-NEXT: [[TMP85:%.*]] = add <4 x i32> [[TMP84]], [[TMP38]]
+; CHECK-NEXT: [[TMP86:%.*]] = add <4 x i32> [[TMP85]], [[TMP56]]
+; CHECK-NEXT: [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP50]]
+; CHECK-NEXT: [[TMP88:%.*]] = and <4 x i32> [[TMP87]], splat (i32 65535)
+; CHECK-NEXT: [[TMP89:%.*]] = lshr <4 x i32> [[TMP87]], splat (i32 16)
+; CHECK-NEXT: [[TMP90:%.*]] = add <4 x i32> [[TMP89]], zeroinitializer
+; CHECK-NEXT: [[TMP91:%.*]] = add <4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
+; CHECK-NEXT: [[TMP92:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP91]])
+; CHECK-NEXT: ret i32 [[TMP92]]
+;
+entry:
+ %arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16
----------------
david-arm wrote:
Can you shrink this loop down to something a lot smaller that demonstrates the same effect? I suspect a trivial memcpy-like loop would work, since all you really care about is using 32-bit element types to get the VF=4 result.
https://github.com/llvm/llvm-project/pull/132170
More information about the llvm-commits mailing list