[llvm] [LoopVectorizer][AArch64] Move getMinTripCountTailFoldingThreshold later. (PR #132170)

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 26 04:35:43 PDT 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/132170

>From 6fb8134598aee48fdce33bbc9a95157fb4bc6964 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 19 Mar 2025 14:41:25 +0000
Subject: [PATCH 1/5] [AArch64][LoopVectorizer] Add a test for VF=4
 low-trip-count vectorization. NFC

This also adds simplifycfg to make it clear what the result of vectorizing
the loops in the test is.
---
 .../AArch64/low_trip_count_predicates.ll      | 272 ++++++++++++++++--
 1 file changed, 241 insertions(+), 31 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index a39c324ca7016..b8a0966696c8b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; REQUIRES: asserts
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS1
+; RUN: opt -S < %s -p "loop-vectorize,simplifycfg" -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS1
 ; RUN: cat %t | FileCheck %s --check-prefixes=DEBUG,DEBUG-VS1
-; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mcpu=neoverse-v1 -sve-tail-folding=disabled 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS2
+; RUN: opt -S < %s -p "loop-vectorize,simplifycfg" -debug-only=loop-vectorize -mcpu=neoverse-v1 -sve-tail-folding=disabled 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS2
 ; RUN: cat %t | FileCheck %s --check-prefixes=DEBUG,DEBUG-VS2
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -90,7 +90,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VS1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VS1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS1-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK-VS1-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK-VS1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-VS1-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS1-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
@@ -123,7 +123,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VS1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
-; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
@@ -136,9 +136,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-VS1-NEXT:    [[TMP38:%.*]] = and i64 [[IV_NEXT]], 4294967295
 ; CHECK-VS1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP38]], 19
-; CHECK-VS1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VS1:       [[WHILE_END_LOOPEXIT]]:
-; CHECK-VS1-NEXT:    br label %[[WHILE_END]]
+; CHECK-VS1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-VS1:       [[WHILE_END]]:
 ; CHECK-VS1-NEXT:    ret void
 ;
@@ -196,7 +194,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VS2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VS2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS2-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK-VS2-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK-VS2:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-VS2-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS2-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
@@ -229,7 +227,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VS2:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
-; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
@@ -242,9 +240,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-VS2-NEXT:    [[TMP38:%.*]] = and i64 [[IV_NEXT]], 4294967295
 ; CHECK-VS2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP38]], 19
-; CHECK-VS2-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VS2:       [[WHILE_END_LOOPEXIT]]:
-; CHECK-VS2-NEXT:    br label %[[WHILE_END]]
+; CHECK-VS2-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-VS2:       [[WHILE_END]]:
 ; CHECK-VS2-NEXT:    ret void
 ;
@@ -293,9 +289,7 @@ define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
-; CHECK:       [[WHILE_END_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -353,9 +347,7 @@ define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture nou
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
-; CHECK:       [[WHILE_END_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -406,8 +398,6 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TC]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1028, [[TMP20]]
-; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TC]], 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub i64 1027, [[TMP22]]
@@ -416,7 +406,7 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295
 ; CHECK-NEXT:    [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[WHILE_BODY:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
@@ -444,14 +434,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[TMP16]], i32 0
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[WHILE_END]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[WHILE_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -459,9 +444,7 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       [[WHILE_END_LOOPEXIT]]:
-; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -490,6 +473,233 @@ while.end:
   ret void
 }
 
+; This has a trip-count of 4, and should vectorize with vf==4.
+define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
+; CHECK-LABEL: define i32 @tc4(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
+; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 64
+; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
+; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
+; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    [[ADD89_LCSSA:%.*]] = phi i32 [ [[ADD89:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    ret i32 [[ADD89_LCSSA]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD89]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[ADD17:%.*]] = add i32 [[TMP11]], [[TMP2]]
+; CHECK-NEXT:    [[SUB24:%.*]] = sub i32 [[TMP2]], [[TMP11]]
+; CHECK-NEXT:    [[ADD25:%.*]] = add i32 [[ADD17]], [[ADD]]
+; CHECK-NEXT:    [[SUB26:%.*]] = sub i32 [[ADD]], [[ADD17]]
+; CHECK-NEXT:    [[ADD27:%.*]] = add i32 [[SUB24]], [[SUB]]
+; CHECK-NEXT:    [[SUB28:%.*]] = sub i32 [[SUB]], [[SUB24]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[ADD36:%.*]] = add i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[SUB44:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-NEXT:    [[ADD52:%.*]] = add i32 [[TMP12]], [[TMP6]]
+; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[ADD61:%.*]] = add i32 [[ADD52]], [[ADD36]]
+; CHECK-NEXT:    [[SUB62:%.*]] = sub i32 [[ADD36]], [[ADD52]]
+; CHECK-NEXT:    [[ADD63:%.*]] = add i32 [[SUB60]], [[SUB44]]
+; CHECK-NEXT:    [[SUB64:%.*]] = sub i32 [[SUB44]], [[SUB60]]
+; CHECK-NEXT:    [[ADD65:%.*]] = add i32 [[ADD61]], [[ADD25]]
+; CHECK-NEXT:    [[SHR_I173:%.*]] = lshr i32 [[ADD65]], 15
+; CHECK-NEXT:    [[AND_I174:%.*]] = and i32 [[SHR_I173]], 65537
+; CHECK-NEXT:    [[MUL_I175:%.*]] = mul nuw i32 [[AND_I174]], 65535
+; CHECK-NEXT:    [[ADD_I176:%.*]] = add i32 [[MUL_I175]], [[ADD65]]
+; CHECK-NEXT:    [[XOR_I177:%.*]] = xor i32 [[ADD_I176]], [[MUL_I175]]
+; CHECK-NEXT:    [[SUB66:%.*]] = sub i32 [[ADD25]], [[ADD61]]
+; CHECK-NEXT:    [[SHR_I168:%.*]] = lshr i32 [[SUB66]], 15
+; CHECK-NEXT:    [[AND_I169:%.*]] = and i32 [[SHR_I168]], 65537
+; CHECK-NEXT:    [[MUL_I170:%.*]] = mul nuw i32 [[AND_I169]], 65535
+; CHECK-NEXT:    [[ADD_I171:%.*]] = add i32 [[MUL_I170]], [[SUB66]]
+; CHECK-NEXT:    [[XOR_I172:%.*]] = xor i32 [[ADD_I171]], [[MUL_I170]]
+; CHECK-NEXT:    [[ADD69:%.*]] = add i32 [[ADD63]], [[ADD27]]
+; CHECK-NEXT:    [[SHR_I163:%.*]] = lshr i32 [[ADD69]], 15
+; CHECK-NEXT:    [[AND_I164:%.*]] = and i32 [[SHR_I163]], 65537
+; CHECK-NEXT:    [[MUL_I165:%.*]] = mul nuw i32 [[AND_I164]], 65535
+; CHECK-NEXT:    [[ADD_I166:%.*]] = add i32 [[MUL_I165]], [[ADD69]]
+; CHECK-NEXT:    [[XOR_I167:%.*]] = xor i32 [[ADD_I166]], [[MUL_I165]]
+; CHECK-NEXT:    [[SUB71:%.*]] = sub i32 [[ADD27]], [[ADD63]]
+; CHECK-NEXT:    [[SHR_I158:%.*]] = lshr i32 [[SUB71]], 15
+; CHECK-NEXT:    [[AND_I159:%.*]] = and i32 [[SHR_I158]], 65537
+; CHECK-NEXT:    [[MUL_I160:%.*]] = mul nuw i32 [[AND_I159]], 65535
+; CHECK-NEXT:    [[ADD_I161:%.*]] = add i32 [[MUL_I160]], [[SUB71]]
+; CHECK-NEXT:    [[XOR_I162:%.*]] = xor i32 [[ADD_I161]], [[MUL_I160]]
+; CHECK-NEXT:    [[ADD75:%.*]] = add i32 [[SUB62]], [[SUB26]]
+; CHECK-NEXT:    [[SHR_I153:%.*]] = lshr i32 [[ADD75]], 15
+; CHECK-NEXT:    [[AND_I154:%.*]] = and i32 [[SHR_I153]], 65537
+; CHECK-NEXT:    [[MUL_I155:%.*]] = mul nuw i32 [[AND_I154]], 65535
+; CHECK-NEXT:    [[ADD_I156:%.*]] = add i32 [[MUL_I155]], [[ADD75]]
+; CHECK-NEXT:    [[XOR_I157:%.*]] = xor i32 [[ADD_I156]], [[MUL_I155]]
+; CHECK-NEXT:    [[SUB77:%.*]] = sub i32 [[SUB26]], [[SUB62]]
+; CHECK-NEXT:    [[SHR_I148:%.*]] = lshr i32 [[SUB77]], 15
+; CHECK-NEXT:    [[AND_I149:%.*]] = and i32 [[SHR_I148]], 65537
+; CHECK-NEXT:    [[MUL_I150:%.*]] = mul nuw i32 [[AND_I149]], 65535
+; CHECK-NEXT:    [[ADD_I151:%.*]] = add i32 [[MUL_I150]], [[SUB77]]
+; CHECK-NEXT:    [[XOR_I152:%.*]] = xor i32 [[ADD_I151]], [[MUL_I150]]
+; CHECK-NEXT:    [[ADD81:%.*]] = add i32 [[SUB64]], [[SUB28]]
+; CHECK-NEXT:    [[SHR_I143:%.*]] = lshr i32 [[ADD81]], 15
+; CHECK-NEXT:    [[AND_I144:%.*]] = and i32 [[SHR_I143]], 65537
+; CHECK-NEXT:    [[MUL_I145:%.*]] = mul nuw i32 [[AND_I144]], 65535
+; CHECK-NEXT:    [[ADD_I146:%.*]] = add i32 [[MUL_I145]], [[ADD81]]
+; CHECK-NEXT:    [[XOR_I147:%.*]] = xor i32 [[ADD_I146]], [[MUL_I145]]
+; CHECK-NEXT:    [[SUB83:%.*]] = sub i32 [[SUB28]], [[SUB64]]
+; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[SUB83]], 15
+; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
+; CHECK-NEXT:    [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
+; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUB83]]
+; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
+; CHECK-NEXT:    [[ADD73:%.*]] = add i32 [[XOR_I147]], [[XOR_I]]
+; CHECK-NEXT:    [[ADD68:%.*]] = add i32 [[ADD73]], [[XOR_I152]]
+; CHECK-NEXT:    [[ADD74:%.*]] = add i32 [[ADD68]], [[XOR_I157]]
+; CHECK-NEXT:    [[ADD79:%.*]] = add i32 [[ADD74]], [[XOR_I172]]
+; CHECK-NEXT:    [[ADD80:%.*]] = add i32 [[ADD79]], [[XOR_I177]]
+; CHECK-NEXT:    [[ADD85:%.*]] = add i32 [[ADD80]], [[XOR_I162]]
+; CHECK-NEXT:    [[ADD86:%.*]] = add i32 [[ADD85]], [[XOR_I167]]
+; CHECK-NEXT:    [[CONV87:%.*]] = and i32 [[ADD86]], 65535
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD86]], 16
+; CHECK-NEXT:    [[ADD88:%.*]] = add i32 [[SHR]], [[SUM_0179]]
+; CHECK-NEXT:    [[ADD89]] = add i32 [[ADD88]], [[CONV87]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+  %arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16
+  %arrayidx11 = getelementptr inbounds nuw i8, ptr %tmp, i64 32
+  %arrayidx14 = getelementptr inbounds nuw i8, ptr %tmp, i64 48
+  %arrayidx30 = getelementptr inbounds nuw i8, ptr %tmp, i64 64
+  %arrayidx33 = getelementptr inbounds nuw i8, ptr %tmp, i64 80
+  %arrayidx46 = getelementptr inbounds nuw i8, ptr %tmp, i64 96
+  %arrayidx49 = getelementptr inbounds nuw i8, ptr %tmp, i64 112
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add89.lcssa = phi i32 [ %add89, %for.body ]
+  ret i32 %add89.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.0179 = phi i32 [ 0, %entry ], [ %add89, %for.body ]
+  %arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx4 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx2, i64 0, i64 %indvars.iv
+  %1 = load i32, ptr %arrayidx4, align 4
+  %add = add i32 %1, %0
+  %sub = sub i32 %0, %1
+  %arrayidx13 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx11, i64 0, i64 %indvars.iv
+  %2 = load i32, ptr %arrayidx13, align 4
+  %arrayidx16 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx14, i64 0, i64 %indvars.iv
+  %3 = load i32, ptr %arrayidx16, align 4
+  %add17 = add i32 %3, %2
+  %sub24 = sub i32 %2, %3
+  %add25 = add i32 %add17, %add
+  %sub26 = sub i32 %add, %add17
+  %add27 = add i32 %sub24, %sub
+  %sub28 = sub i32 %sub, %sub24
+  %arrayidx32 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx30, i64 0, i64 %indvars.iv
+  %4 = load i32, ptr %arrayidx32, align 4
+  %arrayidx35 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx33, i64 0, i64 %indvars.iv
+  %5 = load i32, ptr %arrayidx35, align 4
+  %add36 = add i32 %5, %4
+  %sub44 = sub i32 %4, %5
+  %arrayidx48 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx46, i64 0, i64 %indvars.iv
+  %6 = load i32, ptr %arrayidx48, align 4
+  %arrayidx51 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx49, i64 0, i64 %indvars.iv
+  %7 = load i32, ptr %arrayidx51, align 4
+  %add52 = add i32 %7, %6
+  %sub60 = sub i32 %6, %7
+  %add61 = add i32 %add52, %add36
+  %sub62 = sub i32 %add36, %add52
+  %add63 = add i32 %sub60, %sub44
+  %sub64 = sub i32 %sub44, %sub60
+  %add65 = add i32 %add61, %add25
+  %shr.i173 = lshr i32 %add65, 15
+  %and.i174 = and i32 %shr.i173, 65537
+  %mul.i175 = mul nuw i32 %and.i174, 65535
+  %add.i176 = add i32 %mul.i175, %add65
+  %xor.i177 = xor i32 %add.i176, %mul.i175
+  %sub66 = sub i32 %add25, %add61
+  %shr.i168 = lshr i32 %sub66, 15
+  %and.i169 = and i32 %shr.i168, 65537
+  %mul.i170 = mul nuw i32 %and.i169, 65535
+  %add.i171 = add i32 %mul.i170, %sub66
+  %xor.i172 = xor i32 %add.i171, %mul.i170
+  %add69 = add i32 %add63, %add27
+  %shr.i163 = lshr i32 %add69, 15
+  %and.i164 = and i32 %shr.i163, 65537
+  %mul.i165 = mul nuw i32 %and.i164, 65535
+  %add.i166 = add i32 %mul.i165, %add69
+  %xor.i167 = xor i32 %add.i166, %mul.i165
+  %sub71 = sub i32 %add27, %add63
+  %shr.i158 = lshr i32 %sub71, 15
+  %and.i159 = and i32 %shr.i158, 65537
+  %mul.i160 = mul nuw i32 %and.i159, 65535
+  %add.i161 = add i32 %mul.i160, %sub71
+  %xor.i162 = xor i32 %add.i161, %mul.i160
+  %add75 = add i32 %sub62, %sub26
+  %shr.i153 = lshr i32 %add75, 15
+  %and.i154 = and i32 %shr.i153, 65537
+  %mul.i155 = mul nuw i32 %and.i154, 65535
+  %add.i156 = add i32 %mul.i155, %add75
+  %xor.i157 = xor i32 %add.i156, %mul.i155
+  %sub77 = sub i32 %sub26, %sub62
+  %shr.i148 = lshr i32 %sub77, 15
+  %and.i149 = and i32 %shr.i148, 65537
+  %mul.i150 = mul nuw i32 %and.i149, 65535
+  %add.i151 = add i32 %mul.i150, %sub77
+  %xor.i152 = xor i32 %add.i151, %mul.i150
+  %add81 = add i32 %sub64, %sub28
+  %shr.i143 = lshr i32 %add81, 15
+  %and.i144 = and i32 %shr.i143, 65537
+  %mul.i145 = mul nuw i32 %and.i144, 65535
+  %add.i146 = add i32 %mul.i145, %add81
+  %xor.i147 = xor i32 %add.i146, %mul.i145
+  %sub83 = sub i32 %sub28, %sub64
+  %shr.i = lshr i32 %sub83, 15
+  %and.i = and i32 %shr.i, 65537
+  %mul.i = mul nuw i32 %and.i, 65535
+  %add.i = add i32 %mul.i, %sub83
+  %xor.i = xor i32 %add.i, %mul.i
+  %add73 = add i32 %xor.i147, %xor.i
+  %add68 = add i32 %add73, %xor.i152
+  %add74 = add i32 %add68, %xor.i157
+  %add79 = add i32 %add74, %xor.i172
+  %add80 = add i32 %add79, %xor.i177
+  %add85 = add i32 %add80, %xor.i162
+  %add86 = add i32 %add85, %xor.i167
+  %conv87 = and i32 %add86, 65535
+  %shr = lshr i32 %add86, 16
+  %add88 = add i32 %shr, %sum.0179
+  %add89 = add i32 %add88, %conv87
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 4
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
 
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

>From d189001dfe7320edd8ebd2623e8490117b536831 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 20 Mar 2025 08:46:13 +0000
Subject: [PATCH 2/5] [LoopVectorizer][AArch64] Move
 getMinTripCountTailFoldingThreshold later.

This moves the checks of MinTripCountTailFoldingThreshold later, during the
calculation of whether to tail fold. This allows it to check beforehand whether
tail predication is required, either for scalable or fixed-width vectors.

This option is only specified for AArch64, where it returns a minimum of 5.
This patch aims to allow the vectorization of TC=4 loops, preventing them from
performing more slowly when SVE is present.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  65 +++---
 .../AArch64/low_trip_count_predicates.ll      | 192 +++++++++---------
 2 files changed, 136 insertions(+), 121 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dbed779580dcc..22ea2618324f4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4025,11 +4025,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
   }
 
-  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
-    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
-           "MaxFixedVF must be a power of 2");
-    unsigned MaxVFtimesIC =
-        UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
+  auto IsKnownModTripCountZero = [this, &UserIC](unsigned MaxVF) {
+    unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
     ScalarEvolution *SE = PSE.getSE();
     // Currently only loops with countable exits are vectorized, but calling
     // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
@@ -4043,13 +4040,40 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     const SCEV *Rem = SE->getURemExpr(
         SE->applyLoopGuards(ExitCount, TheLoop),
         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
-    if (Rem->isZero()) {
+    return Rem->isZero();
+  };
+
+  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+    assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
+           "MaxFixedVF must be a power of 2");
+    if (IsKnownModTripCountZero(*MaxPowerOf2RuntimeVF)) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
       return MaxFactors;
     }
   }
 
+  if (MaxTC && MaxTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+    if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+      // If we have a low-trip-count, and the fixed-width VF is known to divide
+      // the trip count but the scalable factor does not, use the fixed-width
+      // factor in preference to allow the generation of a non-predicated loop.
+      if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
+          IsKnownModTripCountZero(MaxFactors.FixedVF.getFixedValue())) {
+        LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
+                             "remain for any chosen VF.\n");
+        MaxFactors.ScalableVF = ElementCount::getScalable(0);
+        return MaxFactors;
+      }
+    }
+
+    reportVectorizationFailure(
+        "The trip count is below the minial threshold value.",
+        "loop trip count is too low, avoiding vectorization", "LowTripCount",
+        ORE, TheLoop);
+    return FixedScalableVFPair::getNone();
+  }
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
@@ -10597,26 +10621,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
     else {
-      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
-        LLVM_DEBUG(dbgs() << "\n");
-        // Predicate tail-folded loops are efficient even when the loop
-        // iteration count is low. However, setting the epilogue policy to
-        // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
-        // with runtime checks. It's more effective to let
-        // `isOutsideLoopWorkProfitable` determine if vectorization is
-        // beneficial for the loop.
-        if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
-          SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
-      } else {
-        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
-                             "small to consider vectorizing.\n");
-        reportVectorizationFailure(
-            "The trip count is below the minial threshold value.",
-            "loop trip count is too low, avoiding vectorization",
-            "LowTripCount", ORE, L);
-        Hints.emitRemarkWithHints();
-        return false;
-      }
+      LLVM_DEBUG(dbgs() << "\n");
+      // Predicate tail-folded loops are efficient even when the loop
+      // iteration count is low. However, setting the epilogue policy to
+      // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
+      // with runtime checks. It's more effective to let
+      // `isOutsideLoopWorkProfitable` determine if vectorization is
+      // beneficial for the loop.
+      if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
+        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
     }
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index b8a0966696c8b..622f410f26f3c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
 ; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
-; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value..
+; DEBUG: LV: Not vectorizing: Runtime SCEV check is required with -Os/-Oz.
 
 ; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks'
 ; DEBUG: LV: Found trip count: 0
@@ -477,7 +477,7 @@ while.end:
 define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
 ; CHECK-LABEL: define i32 @tc4(
 ; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
@@ -485,107 +485,109 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
 ; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
 ; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
 ; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
-; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
-; CHECK:       [[FOR_COND_CLEANUP:.*]]:
-; CHECK-NEXT:    [[ADD89_LCSSA:%.*]] = phi i32 [ [[ADD89:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    ret i32 [[ADD89_LCSSA]]
-; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD89]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 0, 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP8]], [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[ADD17:%.*]] = add i32 [[TMP11]], [[TMP2]]
-; CHECK-NEXT:    [[SUB24:%.*]] = sub i32 [[TMP2]], [[TMP11]]
-; CHECK-NEXT:    [[ADD25:%.*]] = add i32 [[ADD17]], [[ADD]]
-; CHECK-NEXT:    [[SUB26:%.*]] = sub i32 [[ADD]], [[ADD17]]
-; CHECK-NEXT:    [[ADD27:%.*]] = add i32 [[SUB24]], [[SUB]]
-; CHECK-NEXT:    [[SUB28:%.*]] = sub i32 [[SUB]], [[SUB24]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sub <4 x i32> [[TMP5]], [[TMP11]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
+; CHECK-NEXT:    [[TMP16:%.*]] = sub <4 x i32> [[TMP6]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP17]], align 4
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP19]], align 4
-; CHECK-NEXT:    [[ADD36:%.*]] = add i32 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[SUB44:%.*]] = sub i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP19]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP22:%.*]] = sub <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP25]], align 4
-; CHECK-NEXT:    [[ADD52:%.*]] = add i32 [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[SUB60:%.*]] = sub i32 [[TMP6]], [[TMP12]]
-; CHECK-NEXT:    [[ADD61:%.*]] = add i32 [[ADD52]], [[ADD36]]
-; CHECK-NEXT:    [[SUB62:%.*]] = sub i32 [[ADD36]], [[ADD52]]
-; CHECK-NEXT:    [[ADD63:%.*]] = add i32 [[SUB60]], [[SUB44]]
-; CHECK-NEXT:    [[SUB64:%.*]] = sub i32 [[SUB44]], [[SUB60]]
-; CHECK-NEXT:    [[ADD65:%.*]] = add i32 [[ADD61]], [[ADD25]]
-; CHECK-NEXT:    [[SHR_I173:%.*]] = lshr i32 [[ADD65]], 15
-; CHECK-NEXT:    [[AND_I174:%.*]] = and i32 [[SHR_I173]], 65537
-; CHECK-NEXT:    [[MUL_I175:%.*]] = mul nuw i32 [[AND_I174]], 65535
-; CHECK-NEXT:    [[ADD_I176:%.*]] = add i32 [[MUL_I175]], [[ADD65]]
-; CHECK-NEXT:    [[XOR_I177:%.*]] = xor i32 [[ADD_I176]], [[MUL_I175]]
-; CHECK-NEXT:    [[SUB66:%.*]] = sub i32 [[ADD25]], [[ADD61]]
-; CHECK-NEXT:    [[SHR_I168:%.*]] = lshr i32 [[SUB66]], 15
-; CHECK-NEXT:    [[AND_I169:%.*]] = and i32 [[SHR_I168]], 65537
-; CHECK-NEXT:    [[MUL_I170:%.*]] = mul nuw i32 [[AND_I169]], 65535
-; CHECK-NEXT:    [[ADD_I171:%.*]] = add i32 [[MUL_I170]], [[SUB66]]
-; CHECK-NEXT:    [[XOR_I172:%.*]] = xor i32 [[ADD_I171]], [[MUL_I170]]
-; CHECK-NEXT:    [[ADD69:%.*]] = add i32 [[ADD63]], [[ADD27]]
-; CHECK-NEXT:    [[SHR_I163:%.*]] = lshr i32 [[ADD69]], 15
-; CHECK-NEXT:    [[AND_I164:%.*]] = and i32 [[SHR_I163]], 65537
-; CHECK-NEXT:    [[MUL_I165:%.*]] = mul nuw i32 [[AND_I164]], 65535
-; CHECK-NEXT:    [[ADD_I166:%.*]] = add i32 [[MUL_I165]], [[ADD69]]
-; CHECK-NEXT:    [[XOR_I167:%.*]] = xor i32 [[ADD_I166]], [[MUL_I165]]
-; CHECK-NEXT:    [[SUB71:%.*]] = sub i32 [[ADD27]], [[ADD63]]
-; CHECK-NEXT:    [[SHR_I158:%.*]] = lshr i32 [[SUB71]], 15
-; CHECK-NEXT:    [[AND_I159:%.*]] = and i32 [[SHR_I158]], 65537
-; CHECK-NEXT:    [[MUL_I160:%.*]] = mul nuw i32 [[AND_I159]], 65535
-; CHECK-NEXT:    [[ADD_I161:%.*]] = add i32 [[MUL_I160]], [[SUB71]]
-; CHECK-NEXT:    [[XOR_I162:%.*]] = xor i32 [[ADD_I161]], [[MUL_I160]]
-; CHECK-NEXT:    [[ADD75:%.*]] = add i32 [[SUB62]], [[SUB26]]
-; CHECK-NEXT:    [[SHR_I153:%.*]] = lshr i32 [[ADD75]], 15
-; CHECK-NEXT:    [[AND_I154:%.*]] = and i32 [[SHR_I153]], 65537
-; CHECK-NEXT:    [[MUL_I155:%.*]] = mul nuw i32 [[AND_I154]], 65535
-; CHECK-NEXT:    [[ADD_I156:%.*]] = add i32 [[MUL_I155]], [[ADD75]]
-; CHECK-NEXT:    [[XOR_I157:%.*]] = xor i32 [[ADD_I156]], [[MUL_I155]]
-; CHECK-NEXT:    [[SUB77:%.*]] = sub i32 [[SUB26]], [[SUB62]]
-; CHECK-NEXT:    [[SHR_I148:%.*]] = lshr i32 [[SUB77]], 15
-; CHECK-NEXT:    [[AND_I149:%.*]] = and i32 [[SHR_I148]], 65537
-; CHECK-NEXT:    [[MUL_I150:%.*]] = mul nuw i32 [[AND_I149]], 65535
-; CHECK-NEXT:    [[ADD_I151:%.*]] = add i32 [[MUL_I150]], [[SUB77]]
-; CHECK-NEXT:    [[XOR_I152:%.*]] = xor i32 [[ADD_I151]], [[MUL_I150]]
-; CHECK-NEXT:    [[ADD81:%.*]] = add i32 [[SUB64]], [[SUB28]]
-; CHECK-NEXT:    [[SHR_I143:%.*]] = lshr i32 [[ADD81]], 15
-; CHECK-NEXT:    [[AND_I144:%.*]] = and i32 [[SHR_I143]], 65537
-; CHECK-NEXT:    [[MUL_I145:%.*]] = mul nuw i32 [[AND_I144]], 65535
-; CHECK-NEXT:    [[ADD_I146:%.*]] = add i32 [[MUL_I145]], [[ADD81]]
-; CHECK-NEXT:    [[XOR_I147:%.*]] = xor i32 [[ADD_I146]], [[MUL_I145]]
-; CHECK-NEXT:    [[SUB83:%.*]] = sub i32 [[SUB28]], [[SUB64]]
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i32 [[SUB83]], 15
-; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[SHR_I]], 65537
-; CHECK-NEXT:    [[MUL_I:%.*]] = mul nuw i32 [[AND_I]], 65535
-; CHECK-NEXT:    [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUB83]]
-; CHECK-NEXT:    [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[MUL_I]]
-; CHECK-NEXT:    [[ADD73:%.*]] = add i32 [[XOR_I147]], [[XOR_I]]
-; CHECK-NEXT:    [[ADD68:%.*]] = add i32 [[ADD73]], [[XOR_I152]]
-; CHECK-NEXT:    [[ADD74:%.*]] = add i32 [[ADD68]], [[XOR_I157]]
-; CHECK-NEXT:    [[ADD79:%.*]] = add i32 [[ADD74]], [[XOR_I172]]
-; CHECK-NEXT:    [[ADD80:%.*]] = add i32 [[ADD79]], [[XOR_I177]]
-; CHECK-NEXT:    [[ADD85:%.*]] = add i32 [[ADD80]], [[XOR_I162]]
-; CHECK-NEXT:    [[ADD86:%.*]] = add i32 [[ADD85]], [[XOR_I167]]
-; CHECK-NEXT:    [[CONV87:%.*]] = and i32 [[ADD86]], 65535
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[ADD86]], 16
-; CHECK-NEXT:    [[ADD88:%.*]] = add i32 [[SHR]], [[SUM_0179]]
-; CHECK-NEXT:    [[ADD89]] = add i32 [[ADD88]], [[CONV87]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP26]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
+; CHECK-NEXT:    [[TMP28:%.*]] = sub <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP27]], [[TMP21]]
+; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP21]], [[TMP27]]
+; CHECK-NEXT:    [[TMP31:%.*]] = add <4 x i32> [[TMP28]], [[TMP22]]
+; CHECK-NEXT:    [[TMP32:%.*]] = sub <4 x i32> [[TMP22]], [[TMP28]]
+; CHECK-NEXT:    [[TMP33:%.*]] = add <4 x i32> [[TMP29]], [[TMP13]]
+; CHECK-NEXT:    [[TMP34:%.*]] = lshr <4 x i32> [[TMP33]], splat (i32 15)
+; CHECK-NEXT:    [[TMP35:%.*]] = and <4 x i32> [[TMP34]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP36:%.*]] = mul nuw <4 x i32> [[TMP35]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP37:%.*]] = add <4 x i32> [[TMP36]], [[TMP33]]
+; CHECK-NEXT:    [[TMP38:%.*]] = xor <4 x i32> [[TMP37]], [[TMP36]]
+; CHECK-NEXT:    [[TMP39:%.*]] = sub <4 x i32> [[TMP13]], [[TMP29]]
+; CHECK-NEXT:    [[TMP40:%.*]] = lshr <4 x i32> [[TMP39]], splat (i32 15)
+; CHECK-NEXT:    [[TMP41:%.*]] = and <4 x i32> [[TMP40]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw <4 x i32> [[TMP41]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP43:%.*]] = add <4 x i32> [[TMP42]], [[TMP39]]
+; CHECK-NEXT:    [[TMP44:%.*]] = xor <4 x i32> [[TMP43]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = add <4 x i32> [[TMP31]], [[TMP15]]
+; CHECK-NEXT:    [[TMP46:%.*]] = lshr <4 x i32> [[TMP45]], splat (i32 15)
+; CHECK-NEXT:    [[TMP47:%.*]] = and <4 x i32> [[TMP46]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP48:%.*]] = mul nuw <4 x i32> [[TMP47]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP45]]
+; CHECK-NEXT:    [[TMP50:%.*]] = xor <4 x i32> [[TMP49]], [[TMP48]]
+; CHECK-NEXT:    [[TMP51:%.*]] = sub <4 x i32> [[TMP15]], [[TMP31]]
+; CHECK-NEXT:    [[TMP52:%.*]] = lshr <4 x i32> [[TMP51]], splat (i32 15)
+; CHECK-NEXT:    [[TMP53:%.*]] = and <4 x i32> [[TMP52]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP54:%.*]] = mul nuw <4 x i32> [[TMP53]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP51]]
+; CHECK-NEXT:    [[TMP56:%.*]] = xor <4 x i32> [[TMP55]], [[TMP54]]
+; CHECK-NEXT:    [[TMP57:%.*]] = add <4 x i32> [[TMP30]], [[TMP14]]
+; CHECK-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], splat (i32 15)
+; CHECK-NEXT:    [[TMP59:%.*]] = and <4 x i32> [[TMP58]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP60:%.*]] = mul nuw <4 x i32> [[TMP59]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP61:%.*]] = add <4 x i32> [[TMP60]], [[TMP57]]
+; CHECK-NEXT:    [[TMP62:%.*]] = xor <4 x i32> [[TMP61]], [[TMP60]]
+; CHECK-NEXT:    [[TMP63:%.*]] = sub <4 x i32> [[TMP14]], [[TMP30]]
+; CHECK-NEXT:    [[TMP64:%.*]] = lshr <4 x i32> [[TMP63]], splat (i32 15)
+; CHECK-NEXT:    [[TMP65:%.*]] = and <4 x i32> [[TMP64]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP66:%.*]] = mul nuw <4 x i32> [[TMP65]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP67:%.*]] = add <4 x i32> [[TMP66]], [[TMP63]]
+; CHECK-NEXT:    [[TMP68:%.*]] = xor <4 x i32> [[TMP67]], [[TMP66]]
+; CHECK-NEXT:    [[TMP69:%.*]] = add <4 x i32> [[TMP32]], [[TMP16]]
+; CHECK-NEXT:    [[TMP70:%.*]] = lshr <4 x i32> [[TMP69]], splat (i32 15)
+; CHECK-NEXT:    [[TMP71:%.*]] = and <4 x i32> [[TMP70]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP72:%.*]] = mul nuw <4 x i32> [[TMP71]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP73:%.*]] = add <4 x i32> [[TMP72]], [[TMP69]]
+; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i32> [[TMP73]], [[TMP72]]
+; CHECK-NEXT:    [[TMP75:%.*]] = sub <4 x i32> [[TMP16]], [[TMP32]]
+; CHECK-NEXT:    [[TMP76:%.*]] = lshr <4 x i32> [[TMP75]], splat (i32 15)
+; CHECK-NEXT:    [[TMP77:%.*]] = and <4 x i32> [[TMP76]], splat (i32 65537)
+; CHECK-NEXT:    [[TMP78:%.*]] = mul nuw <4 x i32> [[TMP77]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP79:%.*]] = add <4 x i32> [[TMP78]], [[TMP75]]
+; CHECK-NEXT:    [[TMP80:%.*]] = xor <4 x i32> [[TMP79]], [[TMP78]]
+; CHECK-NEXT:    [[TMP81:%.*]] = add <4 x i32> [[TMP74]], [[TMP80]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add <4 x i32> [[TMP81]], [[TMP68]]
+; CHECK-NEXT:    [[TMP83:%.*]] = add <4 x i32> [[TMP82]], [[TMP62]]
+; CHECK-NEXT:    [[TMP84:%.*]] = add <4 x i32> [[TMP83]], [[TMP44]]
+; CHECK-NEXT:    [[TMP85:%.*]] = add <4 x i32> [[TMP84]], [[TMP38]]
+; CHECK-NEXT:    [[TMP86:%.*]] = add <4 x i32> [[TMP85]], [[TMP56]]
+; CHECK-NEXT:    [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP50]]
+; CHECK-NEXT:    [[TMP88:%.*]] = and <4 x i32> [[TMP87]], splat (i32 65535)
+; CHECK-NEXT:    [[TMP89:%.*]] = lshr <4 x i32> [[TMP87]], splat (i32 16)
+; CHECK-NEXT:    [[TMP90:%.*]] = add <4 x i32> [[TMP89]], zeroinitializer
+; CHECK-NEXT:    [[TMP91:%.*]] = add <4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
+; CHECK-NEXT:    [[TMP92:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP91]])
+; CHECK-NEXT:    ret i32 [[TMP92]]
 ;
 entry:
   %arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16

>From 9ad5ec3996418177b8e2057b03e03cd43daa8350 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 24 Mar 2025 08:06:47 +0000
Subject: [PATCH 3/5] Address comments

---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  13 +-
 .../AArch64/low_trip_count_predicates.ll      | 258 ++++--------------
 2 files changed, 58 insertions(+), 213 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 22ea2618324f4..a222489f77d89 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4025,7 +4025,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
   }
 
-  auto IsKnownModTripCountZero = [this, &UserIC](unsigned MaxVF) {
+  auto ScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
     unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
     ScalarEvolution *SE = PSE.getSE();
     // Currently only loops with countable exits are vectorized, but calling
@@ -4043,23 +4043,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return Rem->isZero();
   };
 
-  if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+  if (MaxPowerOf2RuntimeVF > 0) {
     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
            "MaxFixedVF must be a power of 2");
-    if (IsKnownModTripCountZero(*MaxPowerOf2RuntimeVF)) {
+    if (ScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
       return MaxFactors;
     }
   }
 
-  if (MaxTC && MaxTC <= TTI.getMinTripCountTailFoldingThreshold()) {
-    if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
+  auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
+  if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+    if (MaxPowerOf2RuntimeVF > 0) {
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width
       // factor in preference to allow the generation of a non-predicated loop.
       if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
-          IsKnownModTripCountZero(MaxFactors.FixedVF.getFixedValue())) {
+          ScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
         LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
                              "remain for any chosen VF.\n");
         MaxFactors.ScalableVF = ElementCount::getScalable(0);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 622f410f26f3c..1ebd56f92db6f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -478,233 +478,75 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
 ; CHECK-LABEL: define i32 @tc4(
 ; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 16
-; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 32
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 48
-; CHECK-NEXT:    [[ARRAYIDX30:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 64
-; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 80
-; CHECK-NEXT:    [[ARRAYIDX46:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 96
-; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP]], i64 112
-; CHECK-NEXT:    [[TMP0:%.*]] = add i64 0, 0
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = add i64 0, 0
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX2]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX11]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX14]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = add <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP11]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = sub <4 x i32> [[TMP5]], [[TMP11]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP16:%.*]] = sub <4 x i32> [[TMP6]], [[TMP12]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX30]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP17]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX33]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = add <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sub <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX46]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP23]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[ARRAYIDX49]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP25]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP26]], align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
-; CHECK-NEXT:    [[TMP28:%.*]] = sub <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
-; CHECK-NEXT:    [[TMP29:%.*]] = add <4 x i32> [[TMP27]], [[TMP21]]
-; CHECK-NEXT:    [[TMP30:%.*]] = sub <4 x i32> [[TMP21]], [[TMP27]]
-; CHECK-NEXT:    [[TMP31:%.*]] = add <4 x i32> [[TMP28]], [[TMP22]]
-; CHECK-NEXT:    [[TMP32:%.*]] = sub <4 x i32> [[TMP22]], [[TMP28]]
-; CHECK-NEXT:    [[TMP33:%.*]] = add <4 x i32> [[TMP29]], [[TMP13]]
-; CHECK-NEXT:    [[TMP34:%.*]] = lshr <4 x i32> [[TMP33]], splat (i32 15)
-; CHECK-NEXT:    [[TMP35:%.*]] = and <4 x i32> [[TMP34]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP36:%.*]] = mul nuw <4 x i32> [[TMP35]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP37:%.*]] = add <4 x i32> [[TMP36]], [[TMP33]]
-; CHECK-NEXT:    [[TMP38:%.*]] = xor <4 x i32> [[TMP37]], [[TMP36]]
-; CHECK-NEXT:    [[TMP39:%.*]] = sub <4 x i32> [[TMP13]], [[TMP29]]
-; CHECK-NEXT:    [[TMP40:%.*]] = lshr <4 x i32> [[TMP39]], splat (i32 15)
-; CHECK-NEXT:    [[TMP41:%.*]] = and <4 x i32> [[TMP40]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP42:%.*]] = mul nuw <4 x i32> [[TMP41]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP43:%.*]] = add <4 x i32> [[TMP42]], [[TMP39]]
-; CHECK-NEXT:    [[TMP44:%.*]] = xor <4 x i32> [[TMP43]], [[TMP42]]
-; CHECK-NEXT:    [[TMP45:%.*]] = add <4 x i32> [[TMP31]], [[TMP15]]
-; CHECK-NEXT:    [[TMP46:%.*]] = lshr <4 x i32> [[TMP45]], splat (i32 15)
-; CHECK-NEXT:    [[TMP47:%.*]] = and <4 x i32> [[TMP46]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP48:%.*]] = mul nuw <4 x i32> [[TMP47]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP49:%.*]] = add <4 x i32> [[TMP48]], [[TMP45]]
-; CHECK-NEXT:    [[TMP50:%.*]] = xor <4 x i32> [[TMP49]], [[TMP48]]
-; CHECK-NEXT:    [[TMP51:%.*]] = sub <4 x i32> [[TMP15]], [[TMP31]]
-; CHECK-NEXT:    [[TMP52:%.*]] = lshr <4 x i32> [[TMP51]], splat (i32 15)
-; CHECK-NEXT:    [[TMP53:%.*]] = and <4 x i32> [[TMP52]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP54:%.*]] = mul nuw <4 x i32> [[TMP53]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP55:%.*]] = add <4 x i32> [[TMP54]], [[TMP51]]
-; CHECK-NEXT:    [[TMP56:%.*]] = xor <4 x i32> [[TMP55]], [[TMP54]]
-; CHECK-NEXT:    [[TMP57:%.*]] = add <4 x i32> [[TMP30]], [[TMP14]]
-; CHECK-NEXT:    [[TMP58:%.*]] = lshr <4 x i32> [[TMP57]], splat (i32 15)
-; CHECK-NEXT:    [[TMP59:%.*]] = and <4 x i32> [[TMP58]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP60:%.*]] = mul nuw <4 x i32> [[TMP59]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP61:%.*]] = add <4 x i32> [[TMP60]], [[TMP57]]
-; CHECK-NEXT:    [[TMP62:%.*]] = xor <4 x i32> [[TMP61]], [[TMP60]]
-; CHECK-NEXT:    [[TMP63:%.*]] = sub <4 x i32> [[TMP14]], [[TMP30]]
-; CHECK-NEXT:    [[TMP64:%.*]] = lshr <4 x i32> [[TMP63]], splat (i32 15)
-; CHECK-NEXT:    [[TMP65:%.*]] = and <4 x i32> [[TMP64]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP66:%.*]] = mul nuw <4 x i32> [[TMP65]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP67:%.*]] = add <4 x i32> [[TMP66]], [[TMP63]]
-; CHECK-NEXT:    [[TMP68:%.*]] = xor <4 x i32> [[TMP67]], [[TMP66]]
-; CHECK-NEXT:    [[TMP69:%.*]] = add <4 x i32> [[TMP32]], [[TMP16]]
-; CHECK-NEXT:    [[TMP70:%.*]] = lshr <4 x i32> [[TMP69]], splat (i32 15)
-; CHECK-NEXT:    [[TMP71:%.*]] = and <4 x i32> [[TMP70]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP72:%.*]] = mul nuw <4 x i32> [[TMP71]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP73:%.*]] = add <4 x i32> [[TMP72]], [[TMP69]]
-; CHECK-NEXT:    [[TMP74:%.*]] = xor <4 x i32> [[TMP73]], [[TMP72]]
-; CHECK-NEXT:    [[TMP75:%.*]] = sub <4 x i32> [[TMP16]], [[TMP32]]
-; CHECK-NEXT:    [[TMP76:%.*]] = lshr <4 x i32> [[TMP75]], splat (i32 15)
-; CHECK-NEXT:    [[TMP77:%.*]] = and <4 x i32> [[TMP76]], splat (i32 65537)
-; CHECK-NEXT:    [[TMP78:%.*]] = mul nuw <4 x i32> [[TMP77]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP79:%.*]] = add <4 x i32> [[TMP78]], [[TMP75]]
-; CHECK-NEXT:    [[TMP80:%.*]] = xor <4 x i32> [[TMP79]], [[TMP78]]
-; CHECK-NEXT:    [[TMP81:%.*]] = add <4 x i32> [[TMP74]], [[TMP80]]
-; CHECK-NEXT:    [[TMP82:%.*]] = add <4 x i32> [[TMP81]], [[TMP68]]
-; CHECK-NEXT:    [[TMP83:%.*]] = add <4 x i32> [[TMP82]], [[TMP62]]
-; CHECK-NEXT:    [[TMP84:%.*]] = add <4 x i32> [[TMP83]], [[TMP44]]
-; CHECK-NEXT:    [[TMP85:%.*]] = add <4 x i32> [[TMP84]], [[TMP38]]
-; CHECK-NEXT:    [[TMP86:%.*]] = add <4 x i32> [[TMP85]], [[TMP56]]
-; CHECK-NEXT:    [[TMP87:%.*]] = add <4 x i32> [[TMP86]], [[TMP50]]
-; CHECK-NEXT:    [[TMP88:%.*]] = and <4 x i32> [[TMP87]], splat (i32 65535)
-; CHECK-NEXT:    [[TMP89:%.*]] = lshr <4 x i32> [[TMP87]], splat (i32 16)
-; CHECK-NEXT:    [[TMP90:%.*]] = add <4 x i32> [[TMP89]], zeroinitializer
-; CHECK-NEXT:    [[TMP91:%.*]] = add <4 x i32> [[TMP90]], [[TMP88]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> zeroinitializer, [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
-; CHECK-NEXT:    [[TMP92:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP91]])
-; CHECK-NEXT:    ret i32 [[TMP92]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT:    ret i32 [[TMP4]]
 ;
 entry:
-  %arrayidx2 = getelementptr inbounds nuw i8, ptr %tmp, i64 16
-  %arrayidx11 = getelementptr inbounds nuw i8, ptr %tmp, i64 32
-  %arrayidx14 = getelementptr inbounds nuw i8, ptr %tmp, i64 48
-  %arrayidx30 = getelementptr inbounds nuw i8, ptr %tmp, i64 64
-  %arrayidx33 = getelementptr inbounds nuw i8, ptr %tmp, i64 80
-  %arrayidx46 = getelementptr inbounds nuw i8, ptr %tmp, i64 96
-  %arrayidx49 = getelementptr inbounds nuw i8, ptr %tmp, i64 112
   br label %for.body
 
 for.cond.cleanup:                                 ; preds = %for.body
-  %add89.lcssa = phi i32 [ %add89, %for.body ]
-  ret i32 %add89.lcssa
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
 
 for.body:                                         ; preds = %entry, %for.body
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %sum.0179 = phi i32 [ 0, %entry ], [ %add89, %for.body ]
+  %sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
   %arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
   %0 = load i32, ptr %arrayidx1, align 4
-  %arrayidx4 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx2, i64 0, i64 %indvars.iv
-  %1 = load i32, ptr %arrayidx4, align 4
-  %add = add i32 %1, %0
-  %sub = sub i32 %0, %1
-  %arrayidx13 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx11, i64 0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx13, align 4
-  %arrayidx16 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx14, i64 0, i64 %indvars.iv
-  %3 = load i32, ptr %arrayidx16, align 4
-  %add17 = add i32 %3, %2
-  %sub24 = sub i32 %2, %3
-  %add25 = add i32 %add17, %add
-  %sub26 = sub i32 %add, %add17
-  %add27 = add i32 %sub24, %sub
-  %sub28 = sub i32 %sub, %sub24
-  %arrayidx32 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx30, i64 0, i64 %indvars.iv
-  %4 = load i32, ptr %arrayidx32, align 4
-  %arrayidx35 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx33, i64 0, i64 %indvars.iv
-  %5 = load i32, ptr %arrayidx35, align 4
-  %add36 = add i32 %5, %4
-  %sub44 = sub i32 %4, %5
-  %arrayidx48 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx46, i64 0, i64 %indvars.iv
-  %6 = load i32, ptr %arrayidx48, align 4
-  %arrayidx51 = getelementptr inbounds nuw [4 x i32], ptr %arrayidx49, i64 0, i64 %indvars.iv
-  %7 = load i32, ptr %arrayidx51, align 4
-  %add52 = add i32 %7, %6
-  %sub60 = sub i32 %6, %7
-  %add61 = add i32 %add52, %add36
-  %sub62 = sub i32 %add36, %add52
-  %add63 = add i32 %sub60, %sub44
-  %sub64 = sub i32 %sub44, %sub60
-  %add65 = add i32 %add61, %add25
-  %shr.i173 = lshr i32 %add65, 15
-  %and.i174 = and i32 %shr.i173, 65537
-  %mul.i175 = mul nuw i32 %and.i174, 65535
-  %add.i176 = add i32 %mul.i175, %add65
-  %xor.i177 = xor i32 %add.i176, %mul.i175
-  %sub66 = sub i32 %add25, %add61
-  %shr.i168 = lshr i32 %sub66, 15
-  %and.i169 = and i32 %shr.i168, 65537
-  %mul.i170 = mul nuw i32 %and.i169, 65535
-  %add.i171 = add i32 %mul.i170, %sub66
-  %xor.i172 = xor i32 %add.i171, %mul.i170
-  %add69 = add i32 %add63, %add27
-  %shr.i163 = lshr i32 %add69, 15
-  %and.i164 = and i32 %shr.i163, 65537
-  %mul.i165 = mul nuw i32 %and.i164, 65535
-  %add.i166 = add i32 %mul.i165, %add69
-  %xor.i167 = xor i32 %add.i166, %mul.i165
-  %sub71 = sub i32 %add27, %add63
-  %shr.i158 = lshr i32 %sub71, 15
-  %and.i159 = and i32 %shr.i158, 65537
-  %mul.i160 = mul nuw i32 %and.i159, 65535
-  %add.i161 = add i32 %mul.i160, %sub71
-  %xor.i162 = xor i32 %add.i161, %mul.i160
-  %add75 = add i32 %sub62, %sub26
-  %shr.i153 = lshr i32 %add75, 15
-  %and.i154 = and i32 %shr.i153, 65537
-  %mul.i155 = mul nuw i32 %and.i154, 65535
-  %add.i156 = add i32 %mul.i155, %add75
-  %xor.i157 = xor i32 %add.i156, %mul.i155
-  %sub77 = sub i32 %sub26, %sub62
-  %shr.i148 = lshr i32 %sub77, 15
-  %and.i149 = and i32 %shr.i148, 65537
-  %mul.i150 = mul nuw i32 %and.i149, 65535
-  %add.i151 = add i32 %mul.i150, %sub77
-  %xor.i152 = xor i32 %add.i151, %mul.i150
-  %add81 = add i32 %sub64, %sub28
-  %shr.i143 = lshr i32 %add81, 15
-  %and.i144 = and i32 %shr.i143, 65537
-  %mul.i145 = mul nuw i32 %and.i144, 65535
-  %add.i146 = add i32 %mul.i145, %add81
-  %xor.i147 = xor i32 %add.i146, %mul.i145
-  %sub83 = sub i32 %sub28, %sub64
-  %shr.i = lshr i32 %sub83, 15
-  %and.i = and i32 %shr.i, 65537
-  %mul.i = mul nuw i32 %and.i, 65535
-  %add.i = add i32 %mul.i, %sub83
-  %xor.i = xor i32 %add.i, %mul.i
-  %add73 = add i32 %xor.i147, %xor.i
-  %add68 = add i32 %add73, %xor.i152
-  %add74 = add i32 %add68, %xor.i157
-  %add79 = add i32 %add74, %xor.i172
-  %add80 = add i32 %add79, %xor.i177
-  %add85 = add i32 %add80, %xor.i162
-  %add86 = add i32 %add85, %xor.i167
-  %conv87 = and i32 %add86, 65535
-  %shr = lshr i32 %add86, 16
-  %add88 = add i32 %shr, %sum.0179
-  %add89 = add i32 %add88, %conv87
+  %add = add i32 %sum.0179, %0
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, 4
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This has a trip-count of 4 from a profile.
+define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) vscale_range(1,16) {
+; CHECK-LABEL: define i32 @tc4_from_profile(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP:.*]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    ret i32 [[TMP4]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD]] = add i32 [[SUM_0179]], [[TMP0]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF7:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx1, align 4
+  %add = add i32 %sum.0179, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !prof !2
+}
+
 
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!2 = !{!"branch_weights", i32 10, i32 30}
+
 ;.
 ; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -713,6 +555,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
 ; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+; CHECK-VS1: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
 ;.
 ; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -721,4 +564,5 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
 ; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
+; CHECK-VS2: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
 ;.

>From b06ca2e821374e557f78a2acabccf3c988767e6f Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 26 Mar 2025 11:17:35 +0000
Subject: [PATCH 4/5] Remove simplify-cfg

---
 .../AArch64/low_trip_count_predicates.ll      | 90 ++++++++++++++-----
 1 file changed, 67 insertions(+), 23 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index 1ebd56f92db6f..4231658db8726 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; REQUIRES: asserts
-; RUN: opt -S < %s -p "loop-vectorize,simplifycfg" -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS1
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS1
 ; RUN: cat %t | FileCheck %s --check-prefixes=DEBUG,DEBUG-VS1
-; RUN: opt -S < %s -p "loop-vectorize,simplifycfg" -debug-only=loop-vectorize -mcpu=neoverse-v1 -sve-tail-folding=disabled 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS2
+; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mcpu=neoverse-v1 -sve-tail-folding=disabled 2>%t | FileCheck %s --check-prefixes=CHECK,CHECK-VS2
 ; RUN: cat %t | FileCheck %s --check-prefixes=DEBUG,DEBUG-VS2
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -90,7 +90,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VS1:       [[MIDDLE_BLOCK]]:
 ; CHECK-VS1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS1-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK-VS1-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK-VS1:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-VS1-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS1-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
@@ -123,7 +123,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VS1:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; CHECK-VS1-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
-; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VS1-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS1:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-VS1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ]
 ; CHECK-VS1-NEXT:    br label %[[WHILE_BODY:.*]]
@@ -136,7 +136,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS1-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-VS1-NEXT:    [[TMP38:%.*]] = and i64 [[IV_NEXT]], 4294967295
 ; CHECK-VS1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP38]], 19
-; CHECK-VS1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VS1-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VS1:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-VS1-NEXT:    br label %[[WHILE_END]]
 ; CHECK-VS1:       [[WHILE_END]]:
 ; CHECK-VS1-NEXT:    ret void
 ;
@@ -194,7 +196,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-VS2:       [[MIDDLE_BLOCK]]:
 ; CHECK-VS2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-VS2-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK-VS2-NEXT:    br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
 ; CHECK-VS2:       [[VEC_EPILOG_ITER_CHECK]]:
 ; CHECK-VS2-NEXT:    [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
 ; CHECK-VS2-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
@@ -227,7 +229,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK-VS2:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
 ; CHECK-VS2-NEXT:    [[CMP_N10:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
-; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VS2-NEXT:    br i1 [[CMP_N10]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK-VS2:       [[VEC_EPILOG_SCALAR_PH]]:
 ; CHECK-VS2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP39]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ]
 ; CHECK-VS2-NEXT:    br label %[[WHILE_BODY:.*]]
@@ -240,7 +242,9 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
 ; CHECK-VS2-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-VS2-NEXT:    [[TMP38:%.*]] = and i64 [[IV_NEXT]], 4294967295
 ; CHECK-VS2-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP38]], 19
-; CHECK-VS2-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VS2-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VS2:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-VS2-NEXT:    br label %[[WHILE_END]]
 ; CHECK-VS2:       [[WHILE_END]]:
 ; CHECK-VS2-NEXT:    ret void
 ;
@@ -289,7 +293,9 @@ define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -347,7 +353,9 @@ define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture nou
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -398,6 +406,8 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TC]], 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 1028, [[TMP20]]
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TC]], 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub i64 1027, [[TMP22]]
@@ -406,7 +416,7 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295
 ; CHECK-NEXT:    [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]]
-; CHECK-NEXT:    br i1 [[TMP28]], label %[[WHILE_BODY:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 16
@@ -434,9 +444,14 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[TMP16]], i32 0
-; CHECK-NEXT:    br i1 [[TMP17]], label %[[WHILE_END]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
 ; CHECK:       [[WHILE_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
@@ -444,7 +459,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %
 ; CHECK-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -477,15 +494,38 @@ while.end:
 define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
 ; CHECK-LABEL: define i32 @tc4(
 ; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = add i64 0, 0
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> zeroinitializer, [[WIDE_LOAD]]
-; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add nuw i64 0, 4
+; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
-; CHECK-NEXT:    ret i32 [[TMP4]]
+; CHECK-NEXT:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[ADD_LCSSA]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM_0179:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD]] = add i32 [[SUM_0179]], [[TMP5]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -522,7 +562,7 @@ define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) v
 ; CHECK-NEXT:    [[ADD]] = add i32 [[SUM_0179]], [[TMP0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -555,7 +595,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
 ; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
-; CHECK-VS1: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
+; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
 ;.
 ; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -564,5 +606,7 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
 ; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
-; CHECK-VS2: [[PROF7]] = !{!"branch_weights", i32 10, i32 30}
+; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
+; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
 ;.

>From 76bf30d9f994dc5ba3bb26aef4614d23a41aa24c Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 26 Mar 2025 11:35:29 +0000
Subject: [PATCH 5/5] ScalarEpilogueNeeded -> NoScalarEpilogueNeeded

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a222489f77d89..1810298b1b619 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4025,7 +4025,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
   }
 
-  auto ScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
+  auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
     unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
     ScalarEvolution *SE = PSE.getSE();
     // Currently only loops with countable exits are vectorized, but calling
@@ -4046,7 +4046,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   if (MaxPowerOf2RuntimeVF > 0) {
     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
            "MaxFixedVF must be a power of 2");
-    if (ScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
+    if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
       // Accept MaxFixedVF if we do not have a tail.
       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
       return MaxFactors;
@@ -4060,7 +4060,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
       // the trip count but the scalable factor does not, use the fixed-width
       // factor in preference to allow the generation of a non-predicated loop.
       if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
-          ScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
+          NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
         LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
                              "remain for any chosen VF.\n");
         MaxFactors.ScalableVF = ElementCount::getScalable(0);



More information about the llvm-commits mailing list