[llvm] [IVDescriptors] Identify min/max recurrences in single pass. (PR #163460)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 22 11:16:32 PST 2026


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/163460

>From 73f6158d73960bbfd3addf5eec62bfe559afaee9 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 13 Nov 2025 18:23:36 +0000
Subject: [PATCH 1/8] [LV] Add additional tests for reductions with
 intermediate stores.

Adds missing test coverage for reductions with intermediate stores,
including partial reductions and additional intermediate stores.
---
 .../partial-reduce-dot-product-neon.ll        | 124 +++++++++++++++++-
 .../reduction-with-invariant-store.ll         |  91 +++++++++++++
 2 files changed, 214 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index ad92b56218bb5..e6f8439d35779 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -2394,7 +2394,128 @@ loop:
   %ec = icmp eq i64 %iv, %n
   br i1 %ec, label %exit, label %loop
 
-exit:
+for.exit:
+  ret i32 %add
+}
+
+define i32 @zext_add_reduc_i8_i32_store_invariant(ptr %a, ptr %dst) {
+; CHECK-INTERLEAVE1-LABEL: define i32 @zext_add_reduc_i8_i32_store_invariant(
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 42
+; CHECK-INTERLEAVE1-NEXT:    store i32 0, ptr [[GEP_DST]], align 4
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1:       vector.memcheck:
+; CHECK-INTERLEAVE1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 172
+; CHECK-INTERLEAVE1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 1024
+; CHECK-INTERLEAVE1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[GEP_DST]], [[SCEVGEP1]]
+; CHECK-INTERLEAVE1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-INTERLEAVE1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVE1:       vector.ph:
+; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       vector.body:
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3]] = add <16 x i32> [[TMP1]], [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1:       middle.block:
+; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVE1:       scalar.ph:
+;
+; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_store_invariant(
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 42
+; CHECK-INTERLEAVED-NEXT:    store i32 0, ptr [[GEP_DST]], align 4
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.memcheck:
+; CHECK-INTERLEAVED-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 172
+; CHECK-INTERLEAVED-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 1024
+; CHECK-INTERLEAVED-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[GEP_DST]], [[SCEVGEP1]]
+; CHECK-INTERLEAVED-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-INTERLEAVED-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1, !alias.scope [[META11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP6]] = add <16 x i32> [[TMP2]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI2]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP5]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    store i32 [[TMP7]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-INTERLEAVED-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-INTERLEAVED:       scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_store_invariant(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 42
+; CHECK-MAXBW-NEXT:    store i32 0, ptr [[GEP_DST]], align 4
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK-MAXBW:       vector.memcheck:
+; CHECK-MAXBW-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 172
+; CHECK-MAXBW-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 1024
+; CHECK-MAXBW-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[GEP_DST]], [[SCEVGEP1]]
+; CHECK-MAXBW-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; CHECK-MAXBW-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-MAXBW-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]]
+; CHECK-MAXBW:       vector.ph:
+; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MAXBW:       vector.body:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP3]] = add <16 x i32> [[TMP1]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-MAXBW-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW:       middle.block:
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
+; CHECK-MAXBW-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-MAXBW-NEXT:    br label [[FOR_EXIT:%.*]]
+; CHECK-MAXBW:       scalar.ph:
+;
+entry:
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
+  store i32 0, ptr %gep.dst, align 4
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %ext.a = zext i8 %load.a to i32
+  %add = add i32 %ext.a, %accum
+  store i32 %add, ptr %gep.dst, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.exit, label %for.body
+
+for.exit:
   ret i32 %add
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
index dfdf1100eb57b..817c0f10a1a50 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -1152,3 +1152,94 @@ for.body:
 exit:
   ret void
 }
+
+define void @test_phi_smax_used_by_intermediate_store(i32 %x, ptr %src, ptr %dst) {
+; CHECK-LABEL: define void @test_phi_smax_used_by_intermediate_store(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 404
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META80:![0-9]+]]
+; CHECK-NEXT:    [[TMP1]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    store i32 [[TMP3]], ptr [[DST]], align 8, !alias.scope [[META84:![0-9]+]], !noalias [[META80]]
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %max.0 = tail call i32 @llvm.smax.i32(i32 %red, i32 %x)
+  store i32 %max.0, ptr %dst, align 8
+  %red.next = tail call i32 @llvm.smax.i32(i32 %red, i32 %l)
+  store i32 %red.next, ptr %dst, align 8
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_phi_smax_used_by_variant_store(i32 %x, ptr %src, ptr %dst.0, ptr %dst.1) {
+; CHECK-LABEL: define void @test_phi_smax_used_by_variant_store(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[SRC:%.*]], ptr [[DST_0:%.*]], ptr [[DST_1:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[MAX_0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[RED]], i32 [[X]])
+; CHECK-NEXT:    [[GEP_DST_0:%.*]] = getelementptr inbounds i32, ptr [[DST_0]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MAX_0]], ptr [[DST_0]], align 8
+; CHECK-NEXT:    [[RED_NEXT]] = tail call i32 @llvm.smax.i32(i32 [[RED]], i32 [[L]])
+; CHECK-NEXT:    store i32 [[RED_NEXT]], ptr [[DST_1]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 100
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src
+  %max.0  = tail call i32 @llvm.smax.i32(i32 %red, i32 %x)
+  %gep.dst.0 = getelementptr inbounds i32, ptr %dst.0, i64 %iv
+  store i32 %max.0, ptr %dst.0, align 8
+  %red.next  = tail call i32 @llvm.smax.i32(i32 %red, i32 %l)
+  store i32 %red.next, ptr %dst.1, align 8
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}

>From 14b46aa1ec680beb9a4e1aed82aadd360ed69fdb Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 2 Oct 2025 17:12:57 +0100
Subject: [PATCH 2/8] [IVDescriptors] Identify min/max recurrences in single
 pass.

This patch tries to extend the approach from
https://github.com/llvm/llvm-project/pull/141431 to all min/max
recurrence kinds.

This patch adds a new getMinMaxRecurrence that identifies all min/max
recurrences in a single pass. It starts at the backedge value of a phi
and tries to identify the kind of the min/max recurrences.

It then walks from the backedge value to its operands recursively until
it reaches out-of-loop values or the phi.

Then users of both the backedge value and all instructions in the chain
from backedge value to phi are checked.

This consolidates all logic to identify min/max recurrences to a single
function, and avoids the need to try to identify each min/max reduction
kind individually.
---
 llvm/include/llvm/Analysis/IVDescriptors.h |   8 -
 llvm/lib/Analysis/IVDescriptors.cpp        | 354 +++++++++++----------
 2 files changed, 182 insertions(+), 180 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 05c17632e0e49..68528b0719f77 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -163,14 +163,6 @@ class RecurrenceDescriptor {
   LLVM_ABI static bool areAllUsesIn(Instruction *I,
                                     SmallPtrSetImpl<Instruction *> &Set);
 
-  /// Returns a struct describing if the instruction is a llvm.(s/u)(min/max),
-  /// llvm.minnum/maxnum or a Select(ICmp(X, Y), X, Y) pair of instructions
-  /// corresponding to a min(X, Y) or max(X, Y), matching the recurrence kind \p
-  /// Kind. \p Prev specifies the description of an already processed select
-  /// instruction, so its corresponding cmp can be matched to it.
-  LLVM_ABI static InstDesc isMinMaxPattern(Instruction *I, RecurKind Kind,
-                                           const InstDesc &Prev);
-
   /// Returns a struct describing whether the instruction is either a
   ///   Select(ICmp(A, B), X, Y), or
   ///   Select(FCmp(A, B), X, Y)
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 89490fee41754..518e7ac0c71f7 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -264,6 +264,173 @@ static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
   return true;
 }
 
+// Helper to collect FMF from a value and its associated fcmp in select patterns
+static FastMathFlags collectMinMaxFMF(Value *V) {
+  FastMathFlags FMF = cast<FPMathOperator>(V)->getFastMathFlags();
+  if (auto *Sel = dyn_cast<SelectInst>(V)) {
+    // Accept FMF on either fcmp or select of a min/max idiom.
+    // TODO: This is a hack to work-around the fact that FMF may not be
+    //       assigned/propagated correctly. If that problem is fixed or we
+    //       standardize on fmin/fmax via intrinsics, this can be removed.
+    if (auto *FCmp = dyn_cast<FCmpInst>(Sel->getCondition()))
+      FMF |= FCmp->getFastMathFlags();
+  }
+  return FMF;
+}
+
+static std::optional<FastMathFlags>
+hasRequiredFastMathFlags(FPMathOperator *FPOp, RecurKind &RK,
+                         FastMathFlags FuncFMF) {
+  bool HasRequiredFMF =
+      (FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
+      (FPOp && FPOp->hasNoNaNs() && FPOp->hasNoSignedZeros()) ||
+      RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
+      RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum;
+  if (!HasRequiredFMF) {
+    if (RK == RecurKind::FMax &&
+        match(FPOp, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      RK = RecurKind::FMaxNum;
+    else if (RK == RecurKind::FMin &&
+             match(FPOp, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+      RK = RecurKind::FMinNum;
+    else
+      return std::nullopt;
+  }
+  return {collectMinMaxFMF(FPOp)};
+}
+
+static std::optional<RecurrenceDescriptor>
+getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
+                  ScalarEvolution *SE) {
+  if (Phi->getNumIncomingValues() != 2 ||
+      Phi->getParent() != TheLoop->getHeader())
+    return std::nullopt;
+
+  Type *Ty = Phi->getType();
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+  if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || !Latch)
+    return std::nullopt;
+
+  auto Matches = [](Value *V, Value *&A, Value *&B) -> RecurKind {
+    if (match(V, m_UMin(m_Value(A), m_Value(B))))
+      return RecurKind::UMin;
+    if (match(V, m_UMax(m_Value(A), m_Value(B))))
+      return RecurKind::UMax;
+    if (match(V, m_SMax(m_Value(A), m_Value(B))))
+      return RecurKind::SMax;
+    if (match(V, m_SMin(m_Value(A), m_Value(B))))
+      return RecurKind::SMin;
+    if (match(V, m_OrdOrUnordFMin(m_Value(A), m_Value(B))) ||
+        match(V, m_Intrinsic<Intrinsic::minnum>(m_Value(A), m_Value(B))))
+      return RecurKind::FMin;
+    if (match(V, m_OrdOrUnordFMax(m_Value(A), m_Value(B))) ||
+        match(V, m_Intrinsic<Intrinsic::maxnum>(m_Value(A), m_Value(B))))
+      return RecurKind::FMax;
+    if (match(V, m_FMinimum(m_Value(A), m_Value(B))))
+      return RecurKind::FMinimum;
+    if (match(V, m_FMaximum(m_Value(A), m_Value(B))))
+      return RecurKind::FMaximum;
+    if (match(V, m_Intrinsic<Intrinsic::minimumnum>(m_Value(A), m_Value(B))))
+      return RecurKind::FMinimumNum;
+    if (match(V, m_Intrinsic<Intrinsic::maximumnum>(m_Value(A), m_Value(B))))
+      return RecurKind::FMaximumNum;
+    return RecurKind::None;
+  };
+
+  FastMathFlags FMF = FastMathFlags::getFast();
+  Value *RdxNext = Phi->getIncomingValueForBlock(Latch);
+  RecurKind RK = RecurKind::None;
+  // Identify min/max recurrences by walking the def-use chains upwards,
+  // starting at RdxNext.
+  SmallVector<Value *> WorkList = {RdxNext};
+  SmallPtrSet<Value *, 8> Chain = {Phi};
+  while (!WorkList.empty()) {
+    Value *Cur = WorkList.pop_back_val();
+    if (!Chain.insert(Cur).second)
+      continue;
+    auto *I = dyn_cast<Instruction>(Cur);
+    if (!I || !TheLoop->contains(I))
+      return std::nullopt;
+    if (auto *PN = dyn_cast<PHINode>(I)) {
+      if (PN != Phi)
+        append_range(WorkList, PN->operands());
+      continue;
+    }
+    Value *A, *B;
+    RecurKind CurRK = Matches(Cur, A, B);
+    if (CurRK == RecurKind::None || (RK != RecurKind::None && CurRK != RK))
+      return std::nullopt;
+
+    RK = CurRK;
+    // For floating point recurrences, check we have the required fast-math
+    // flags.
+    if (RecurrenceDescriptor::isFPMinMaxRecurrenceKind(CurRK)) {
+      if (auto CurFMF =
+              hasRequiredFastMathFlags(cast<FPMathOperator>(Cur), RK, FuncFMF))
+        FMF &= *CurFMF;
+      else
+        return std::nullopt;
+    }
+
+    Chain.insert(I);
+    if (auto *SI = dyn_cast<SelectInst>(I))
+      Chain.insert(SI->getCondition());
+
+    if (A == Phi || B == Phi)
+      continue;
+
+    // Add operand to worklist if it matches the pattern; exactly one of the
+    // two operands must match.
+    Value *X, *Y;
+    auto *IA = dyn_cast<Instruction>(A);
+    auto *IB = dyn_cast<Instruction>(B);
+    bool AMatches = IA && TheLoop->contains(IA) && Matches(A, X, Y) == RK;
+    bool BMatches = IB && TheLoop->contains(IB) && Matches(B, X, Y) == RK;
+    if (AMatches == BMatches) // Both or neither match
+      return std::nullopt;
+    WorkList.push_back(AMatches ? A : B);
+  }
+
+  // Check users of RdxNext. It may only be
+  // * used by a single user outside the loop,
+  // * used by stores to the same invariant address,
+  // * used by the starting recurrence phi.
+  unsigned IncOut = 0;
+  StoreInst *IntermediateStore = nullptr;
+  for (Use &U : RdxNext->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+    if (!TheLoop->contains(User->getParent())) {
+      if (++IncOut > 1)
+        return std::nullopt;
+    } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+      const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
+      if (U.getOperandNo() == SI->getPointerOperandIndex() ||
+          !SE->isLoopInvariant(Ptr, TheLoop) ||
+          (IntermediateStore &&
+           SE->getSCEV(IntermediateStore->getPointerOperand()) != Ptr))
+        return std::nullopt;
+      // Keep the store that appears last in the block, as it will be the final
+      // reduction value.
+      if (!IntermediateStore || IntermediateStore->comesBefore(SI))
+        IntermediateStore = SI;
+    } else if (Phi != User)
+      return std::nullopt;
+  }
+
+  // All ops on the chain from Phi to RdxNext must only be used by instructions
+  // in the chain.
+  for (Value *Op : Chain)
+    if (Op != RdxNext &&
+        any_of(Op->users(), [&Chain](User *U) { return !Chain.contains(U); }))
+      return std::nullopt;
+
+  SmallPtrSet<Instruction *, 4> Casts;
+  return RecurrenceDescriptor(
+      Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),
+      cast<Instruction>(RdxNext), IntermediateStore, RK, FMF, nullptr,
+      Phi->getType(), false, false, Casts, -1U);
+}
+
 bool RecurrenceDescriptor::AddReductionVar(
     PHINode *Phi, RecurKind Kind, Loop *TheLoop, FastMathFlags FuncFMF,
     RecurrenceDescriptor &RedDes, DemandedBits *DB, AssumptionCache *AC,
@@ -304,9 +472,8 @@ bool RecurrenceDescriptor::AddReductionVar(
   // must include the original PHI.
   bool FoundStartPHI = false;
 
-  // To recognize min/max patterns formed by a icmp select sequence, we store
-  // the number of instruction we saw from the recognized min/max pattern,
-  //  to make sure we only see exactly the two instructions.
+  // To recognize AnyOf patterns formed by an icmp/select sequence, we store
+  // the number of instructions we saw to make sure we only see one.
   unsigned NumCmpSelectPatternInst = 0;
   InstDesc ReduxDesc(false, nullptr);
 
@@ -333,8 +500,7 @@ bool RecurrenceDescriptor::AddReductionVar(
   } else if (ScalarTy->isIntegerTy()) {
     if (!isIntegerRecurrenceKind(Kind))
       return false;
-    if (!isMinMaxRecurrenceKind(Kind))
-      Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
+    Start = lookThroughAnd(Phi, RecurrenceType, VisitedInsts, CastInsts);
   } else {
     // Pointer min/max may exist, but it is not supported as a reduction op.
     return false;
@@ -441,18 +607,8 @@ bool RecurrenceDescriptor::AddReductionVar(
       if (!ReduxDesc.isRecurrence())
         return false;
       // FIXME: FMF is allowed on phi, but propagation is not handled correctly.
-      if (isa<FPMathOperator>(ReduxDesc.getPatternInst()) && !IsAPhi) {
-        FastMathFlags CurFMF = ReduxDesc.getPatternInst()->getFastMathFlags();
-        if (auto *Sel = dyn_cast<SelectInst>(ReduxDesc.getPatternInst())) {
-          // Accept FMF on either fcmp or select of a min/max idiom.
-          // TODO: This is a hack to work-around the fact that FMF may not be
-          //       assigned/propagated correctly. If that problem is fixed or we
-          //       standardize on fmin/fmax via intrinsics, this can be removed.
-          if (auto *FCmp = dyn_cast<FCmpInst>(Sel->getCondition()))
-            CurFMF |= FCmp->getFastMathFlags();
-        }
-        FMF &= CurFMF;
-      }
+      if (isa<FPMathOperator>(ReduxDesc.getPatternInst()) && !IsAPhi)
+        FMF &= collectMinMaxFMF(ReduxDesc.getPatternInst());
       // Update this reduction kind if we matched a new instruction.
       // TODO: Can we eliminate the need for a 2nd InstDesc by keeping 'Kind'
       //       state accurate while processing the worklist?
@@ -469,18 +625,14 @@ bool RecurrenceDescriptor::AddReductionVar(
       return false;
 
     // A reduction operation must only have one use of the reduction value.
-    if (!IsAPhi && !IsASelect && !isMinMaxRecurrenceKind(Kind) &&
-        !isAnyOfRecurrenceKind(Kind) && hasMultipleUsesOf(Cur, VisitedInsts, 1))
+    if (!IsAPhi && !IsASelect && !isAnyOfRecurrenceKind(Kind) &&
+        hasMultipleUsesOf(Cur, VisitedInsts, 1))
       return false;
 
     // All inputs to a PHI node must be a reduction value.
     if (IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
       return false;
 
-    if (isIntMinMaxRecurrenceKind(Kind) && (isa<ICmpInst>(Cur) || IsASelect))
-      ++NumCmpSelectPatternInst;
-    if (isFPMinMaxRecurrenceKind(Kind) && (isa<FCmpInst>(Cur) || IsASelect))
-      ++NumCmpSelectPatternInst;
     if (isAnyOfRecurrenceKind(Kind) && IsASelect)
       ++NumCmpSelectPatternInst;
 
@@ -527,7 +679,7 @@ bool RecurrenceDescriptor::AddReductionVar(
       }
 
       // Process instructions only once (termination). Each reduction cycle
-      // value must only be used once, except by phi nodes and min/max
+      // value must only be used once, except by phi nodes and conditional
       // reductions which are represented as a cmp followed by a select.
       InstDesc IgnoredVal(false, nullptr);
       if (VisitedInsts.insert(UI).second) {
@@ -543,12 +695,9 @@ bool RecurrenceDescriptor::AddReductionVar(
           NonPHIs.push_back(UI);
         }
       } else if (!isa<PHINode>(UI) &&
-                 ((!isa<FCmpInst>(UI) && !isa<ICmpInst>(UI) &&
-                   !isa<SelectInst>(UI)) ||
-                  (!isConditionalRdxPattern(UI).isRecurrence() &&
+                 ((!isConditionalRdxPattern(UI).isRecurrence() &&
                    !isAnyOfPattern(TheLoop, Phi, UI, IgnoredVal)
-                        .isRecurrence() &&
-                   !isMinMaxPattern(UI, Kind, IgnoredVal).isRecurrence())))
+                        .isRecurrence())))
         return false;
 
       // Remember that we completed the cycle.
@@ -559,13 +708,6 @@ bool RecurrenceDescriptor::AddReductionVar(
     Worklist.append(NonPHIs.begin(), NonPHIs.end());
   }
 
-  // This means we have seen one but not the other instruction of the
-  // pattern or more than just a select and cmp. Zero implies that we saw a
-  // llvm.min/max intrinsic, which is always OK.
-  if (isMinMaxRecurrenceKind(Kind) && NumCmpSelectPatternInst != 2 &&
-      NumCmpSelectPatternInst != 0)
-    return false;
-
   if (isAnyOfRecurrenceKind(Kind) && NumCmpSelectPatternInst != 1)
     return false;
 
@@ -851,55 +993,6 @@ RecurrenceDescriptor::isFindPattern(Loop *TheLoop, PHINode *OrigPhi,
   return InstDesc(I, RecurKind::FindLast);
 }
 
-RecurrenceDescriptor::InstDesc
-RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
-                                      const InstDesc &Prev) {
-  assert((isa<CmpInst>(I) || isa<SelectInst>(I) || isa<CallInst>(I)) &&
-         "Expected a cmp or select or call instruction");
-  if (!isMinMaxRecurrenceKind(Kind))
-    return InstDesc(false, I);
-
-  // We must handle the select(cmp()) as a single instruction. Advance to the
-  // select.
-  if (match(I, m_OneUse(m_Cmp()))) {
-    if (auto *Select = dyn_cast<SelectInst>(*I->user_begin()))
-      return InstDesc(Select, Prev.getRecKind());
-  }
-
-  // Only match select with single use cmp condition, or a min/max intrinsic.
-  if (!isa<IntrinsicInst>(I) &&
-      !match(I, m_Select(m_OneUse(m_Cmp()), m_Value(), m_Value())))
-    return InstDesc(false, I);
-
-  // Look for a min/max pattern.
-  if (match(I, m_UMin(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::UMin, I);
-  if (match(I, m_UMax(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::UMax, I);
-  if (match(I, m_SMax(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::SMax, I);
-  if (match(I, m_SMin(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::SMin, I);
-  if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMin, I);
-  if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMax, I);
-  if (match(I, m_FMinNum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMin, I);
-  if (match(I, m_FMaxNum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMax, I);
-  if (match(I, m_FMinimumNum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMinimumNum, I);
-  if (match(I, m_FMaximumNum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMaximumNum, I);
-  if (match(I, m_FMinimum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMinimum, I);
-  if (match(I, m_FMaximum(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMaximum, I);
-
-  return InstDesc(false, I);
-}
-
 /// Returns true if the select instruction has users in the compare-and-add
 /// reduction pattern below. The select instruction argument is the last one
 /// in the sequence.
@@ -990,43 +1083,6 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
   case Instruction::Call:
     if (isAnyOfRecurrenceKind(Kind))
       return isAnyOfPattern(L, OrigPhi, I, Prev);
-    auto HasRequiredFMF = [&]() {
-     if (FuncFMF.noNaNs() && FuncFMF.noSignedZeros())
-       return true;
-     if (isa<FPMathOperator>(I) && I->hasNoNaNs() && I->hasNoSignedZeros())
-       return true;
-     // minimum/minnum and maximum/maxnum intrinsics do not require nsz and nnan
-     // flags since NaN and signed zeroes are propagated in the intrinsic
-     // implementation.
-     return match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())) ||
-            match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())) ||
-            match(I,
-                  m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
-            match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
-    };
-    if (isIntMinMaxRecurrenceKind(Kind))
-      return isMinMaxPattern(I, Kind, Prev);
-    if (isFPMinMaxRecurrenceKind(Kind)) {
-      InstDesc Res = isMinMaxPattern(I, Kind, Prev);
-      if (!Res.isRecurrence())
-        return InstDesc(false, I);
-      if (HasRequiredFMF())
-        return Res;
-      // We may be able to vectorize FMax/FMin reductions using maxnum/minnum
-      // intrinsics with extra checks ensuring the vector loop handles only
-      // non-NaN inputs.
-      if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
-        assert(Kind == RecurKind::FMax &&
-               "unexpected recurrence kind for maxnum");
-        return InstDesc(I, RecurKind::FMaxNum);
-      }
-      if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
-        assert(Kind == RecurKind::FMin &&
-               "unexpected recurrence kind for minnum");
-        return InstDesc(I, RecurKind::FMinNum);
-      }
-      return InstDesc(false, I);
-    }
     if (isFMulAddIntrinsic(I))
       return InstDesc(Kind == RecurKind::FMulAdd, I,
                       I->hasAllowReassoc() ? nullptr : I);
@@ -1097,24 +1153,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
     LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::SMax, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a SMAX reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::SMin, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a SMIN reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::UMax, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a UMAX reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::UMin, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a UMIN reduction PHI." << *Phi << "\n");
+  if (auto RD = getMultiUseMinMax(Phi, TheLoop, FMF, SE)) {
+    LLVM_DEBUG(dbgs() << "Found a min/max reduction PHI." << *Phi << "\n");
+    RedDes = *RD;
     return true;
   }
   if (AddReductionVar(Phi, RecurKind::AnyOf, TheLoop, FMF, RedDes, DB, AC, DT,
@@ -1138,43 +1179,12 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
     LLVM_DEBUG(dbgs() << "Found an FAdd reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FMax, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MAX reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::FMin, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MIN reduction PHI." << *Phi << "\n");
-    return true;
-  }
   if (AddReductionVar(Phi, RecurKind::FMulAdd, TheLoop, FMF, RedDes, DB, AC, DT,
                       SE)) {
     LLVM_DEBUG(dbgs() << "Found an FMulAdd reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (AddReductionVar(Phi, RecurKind::FMaximum, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MAXIMUM reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::FMinimum, TheLoop, FMF, RedDes, DB, AC, DT,
-                      SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MINIMUM reduction PHI." << *Phi << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::FMaximumNum, TheLoop, FMF, RedDes, DB, AC,
-                      DT, SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MAXIMUMNUM reduction PHI." << *Phi
-                      << "\n");
-    return true;
-  }
-  if (AddReductionVar(Phi, RecurKind::FMinimumNum, TheLoop, FMF, RedDes, DB, AC,
-                      DT, SE)) {
-    LLVM_DEBUG(dbgs() << "Found a float MINIMUMNUM reduction PHI." << *Phi
-                      << "\n");
-    return true;
-  }
+
   // Not a reduction of known type.
   return false;
 }

>From b861d3c0c30617941aeefe4bb93fae1b24903934 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 5 Nov 2025 15:26:32 +0000
Subject: [PATCH 3/8] !fixup address comments, thanks

---
 llvm/lib/Analysis/IVDescriptors.cpp | 37 ++++++++++++++++-------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 518e7ac0c71f7..3cde8c78e430d 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -300,17 +300,17 @@ hasRequiredFastMathFlags(FPMathOperator *FPOp, RecurKind &RK,
   return {collectMinMaxFMF(FPOp)};
 }
 
-static std::optional<RecurrenceDescriptor>
-getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
-                  ScalarEvolution *SE) {
+static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
+                                                FastMathFlags FuncFMF,
+                                                ScalarEvolution *SE) {
   if (Phi->getNumIncomingValues() != 2 ||
       Phi->getParent() != TheLoop->getHeader())
-    return std::nullopt;
+    return {};
 
   Type *Ty = Phi->getType();
   BasicBlock *Latch = TheLoop->getLoopLatch();
   if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || !Latch)
-    return std::nullopt;
+    return {};
 
   auto Matches = [](Value *V, Value *&A, Value *&B) -> RecurKind {
     if (match(V, m_UMin(m_Value(A), m_Value(B))))
@@ -351,7 +351,7 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
       continue;
     auto *I = dyn_cast<Instruction>(Cur);
     if (!I || !TheLoop->contains(I))
-      return std::nullopt;
+      return {};
     if (auto *PN = dyn_cast<PHINode>(I)) {
       if (PN != Phi)
         append_range(WorkList, PN->operands());
@@ -360,7 +360,7 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
     Value *A, *B;
     RecurKind CurRK = Matches(Cur, A, B);
     if (CurRK == RecurKind::None || (RK != RecurKind::None && CurRK != RK))
-      return std::nullopt;
+      return {};
 
     RK = CurRK;
     // For floating point recurrences, check we have the required fast-math
@@ -370,7 +370,7 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
               hasRequiredFastMathFlags(cast<FPMathOperator>(Cur), RK, FuncFMF))
         FMF &= *CurFMF;
       else
-        return std::nullopt;
+        return {};
     }
 
     Chain.insert(I);
@@ -388,7 +388,7 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
     bool AMatches = IA && TheLoop->contains(IA) && Matches(A, X, Y) == RK;
     bool BMatches = IB && TheLoop->contains(IB) && Matches(B, X, Y) == RK;
     if (AMatches == BMatches) // Both or neither match
-      return std::nullopt;
+      return {};
     WorkList.push_back(AMatches ? A : B);
   }
 
@@ -401,21 +401,22 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
   for (Use &U : RdxNext->uses()) {
     auto *User = cast<Instruction>(U.getUser());
     if (!TheLoop->contains(User->getParent())) {
-      if (++IncOut > 1)
-        return std::nullopt;
+      if (IncOut > 0)
+        return {};
+      IncOut++;
     } else if (auto *SI = dyn_cast<StoreInst>(User)) {
       const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
       if (U.getOperandNo() == SI->getPointerOperandIndex() ||
           !SE->isLoopInvariant(Ptr, TheLoop) ||
           (IntermediateStore &&
            SE->getSCEV(IntermediateStore->getPointerOperand()) != Ptr))
-        return std::nullopt;
+        return {};
       // Keep the store that appears last in the block, as it will be the final
       // reduction value.
       if (!IntermediateStore || IntermediateStore->comesBefore(SI))
         IntermediateStore = SI;
     } else if (Phi != User)
-      return std::nullopt;
+      return {};
   }
 
   // All ops on the chain from Phi to RdxNext must only be used by instructions
@@ -423,7 +424,7 @@ getMultiUseMinMax(PHINode *Phi, Loop *TheLoop, FastMathFlags FuncFMF,
   for (Value *Op : Chain)
     if (Op != RdxNext &&
         any_of(Op->users(), [&Chain](User *U) { return !Chain.contains(U); }))
-      return std::nullopt;
+      return {};
 
   SmallPtrSet<Instruction *, 4> Casts;
   return RecurrenceDescriptor(
@@ -1153,9 +1154,13 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
     LLVM_DEBUG(dbgs() << "Found a XOR reduction PHI." << *Phi << "\n");
     return true;
   }
-  if (auto RD = getMultiUseMinMax(Phi, TheLoop, FMF, SE)) {
+  auto RD = getMinMaxRecurrence(Phi, TheLoop, FMF, SE);
+  if (RD.getRecurrenceKind() != RecurKind::None) {
+    assert(
+        RecurrenceDescriptor::isMinMaxRecurrenceKind(RD.getRecurrenceKind()) &&
+        "must return a min/max recurrence kind");
     LLVM_DEBUG(dbgs() << "Found a min/max reduction PHI." << *Phi << "\n");
-    RedDes = *RD;
+    RedDes = RD;
     return true;
   }
   if (AddReductionVar(Phi, RecurKind::AnyOf, TheLoop, FMF, RedDes, DB, AC, DT,

>From ee0e1f9ac6a272031398a39af98142b6399ea161 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 12 Nov 2025 22:11:12 +0000
Subject: [PATCH 4/8] !fixup adjust code, thanks

---
 llvm/lib/Analysis/IVDescriptors.cpp | 58 +++++++++++++++++------------
 1 file changed, 34 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 3cde8c78e430d..c1bbfe93ea0fb 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -282,22 +282,32 @@ static FastMathFlags collectMinMaxFMF(Value *V) {
 static std::optional<FastMathFlags>
 hasRequiredFastMathFlags(FPMathOperator *FPOp, RecurKind &RK,
                          FastMathFlags FuncFMF) {
-  bool HasRequiredFMF =
-      (FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
-      (FPOp && FPOp->hasNoNaNs() && FPOp->hasNoSignedZeros()) ||
-      RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
-      RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum;
-  if (!HasRequiredFMF) {
-    if (RK == RecurKind::FMax &&
-        match(FPOp, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
-      RK = RecurKind::FMaxNum;
-    else if (RK == RecurKind::FMin &&
-             match(FPOp, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
-      RK = RecurKind::FMinNum;
-    else
+  bool HasRequiredFMF = (FuncFMF.noNaNs() && FuncFMF.noSignedZeros()) ||
+                        (FPOp && FPOp->hasNoNaNs() && FPOp->hasNoSignedZeros());
+  if (HasRequiredFMF)
+    return collectMinMaxFMF(FPOp);
+
+  switch (RK) {
+  case RecurKind::FMinimum:
+  case RecurKind::FMaximum:
+  case RecurKind::FMinimumNum:
+  case RecurKind::FMaximumNum:
+    break;
+
+  case RecurKind::FMax:
+    if (!match(FPOp, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      return std::nullopt;
+    RK = RecurKind::FMaxNum;
+    break;
+  case RecurKind::FMin:
+    if (!match(FPOp, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
       return std::nullopt;
+    RK = RecurKind::FMinNum;
+    break;
+  default:
+    return std::nullopt;
   }
-  return {collectMinMaxFMF(FPOp)};
+  return collectMinMaxFMF(FPOp);
 }
 
 static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
@@ -312,7 +322,7 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
   if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || !Latch)
     return {};
 
-  auto Matches = [](Value *V, Value *&A, Value *&B) -> RecurKind {
+  auto GetMinMaxRK = [](Value *V, Value *&A, Value *&B) -> RecurKind {
     if (match(V, m_UMin(m_Value(A), m_Value(B))))
       return RecurKind::UMin;
     if (match(V, m_UMax(m_Value(A), m_Value(B))))
@@ -343,8 +353,8 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
   RecurKind RK = RecurKind::None;
   // Identify min/max recurrences by walking the def-use chains upwards,
   // starting at RdxNext.
-  SmallVector<Value *> WorkList = {RdxNext};
-  SmallPtrSet<Value *, 8> Chain = {Phi};
+  SmallVector<Value *> WorkList({RdxNext});
+  SmallPtrSet<Value *, 8> Chain({Phi});
   while (!WorkList.empty()) {
     Value *Cur = WorkList.pop_back_val();
     if (!Chain.insert(Cur).second)
@@ -358,7 +368,7 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
       continue;
     }
     Value *A, *B;
-    RecurKind CurRK = Matches(Cur, A, B);
+    RecurKind CurRK = GetMinMaxRK(Cur, A, B);
     if (CurRK == RecurKind::None || (RK != RecurKind::None && CurRK != RK))
       return {};
 
@@ -366,11 +376,11 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
     // For floating point recurrences, check we have the required fast-math
     // flags.
     if (RecurrenceDescriptor::isFPMinMaxRecurrenceKind(CurRK)) {
-      if (auto CurFMF =
-              hasRequiredFastMathFlags(cast<FPMathOperator>(Cur), RK, FuncFMF))
-        FMF &= *CurFMF;
-      else
+      auto CurFMF =
+          hasRequiredFastMathFlags(cast<FPMathOperator>(Cur), RK, FuncFMF);
+      if (!CurFMF)
         return {};
+      FMF &= *CurFMF;
     }
 
     Chain.insert(I);
@@ -385,8 +395,8 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
     Value *X, *Y;
     auto *IA = dyn_cast<Instruction>(A);
     auto *IB = dyn_cast<Instruction>(B);
-    bool AMatches = IA && TheLoop->contains(IA) && Matches(A, X, Y) == RK;
-    bool BMatches = IB && TheLoop->contains(IB) && Matches(B, X, Y) == RK;
+    bool AMatches = IA && TheLoop->contains(IA) && GetMinMaxRK(A, X, Y) == RK;
+    bool BMatches = IB && TheLoop->contains(IB) && GetMinMaxRK(B, X, Y) == RK;
     if (AMatches == BMatches) // Both or neither match
       return {};
     WorkList.push_back(AMatches ? A : B);

>From 6fd5211baf3b6449972711a3d490eaa9413636e3 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 13 Nov 2025 15:49:49 +0000
Subject: [PATCH 5/8] !fixup add message, refactor constructor

---
 llvm/include/llvm/Analysis/IVDescriptors.h | 3 ++-
 llvm/lib/Analysis/IVDescriptors.cpp        | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 68528b0719f77..b477915bbffb1 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -105,7 +105,8 @@ class RecurrenceDescriptor {
         IsSigned(Signed), IsOrdered(Ordered),
         PhiHasUsesOutsideReductionChain(PhiHasUsesOutsideReductionChain),
         MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
-    CastInsts.insert_range(CI);
+    if (CI)
+      CastInsts.insert_range(*CI);
     assert(
         (!PhiHasUsesOutsideReductionChain || isMinMaxRecurrenceKind(K)) &&
         "Only min/max recurrences are allowed to have multiple uses currently");
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index c1bbfe93ea0fb..9684e8ea161ab 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -218,7 +218,6 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
   return true;
 }
 
-<<<<<<< HEAD
 /// Returns true if \p Phi is a min/max reduction matching \p Kind where \p Phi
 /// is used outside the reduction chain. This is common for loops selecting the
 /// index of a minimum/maximum value (argmin/argmax).
@@ -440,7 +439,7 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
   return RecurrenceDescriptor(
       Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),
       cast<Instruction>(RdxNext), IntermediateStore, RK, FMF, nullptr,
-      Phi->getType(), false, false, Casts, -1U);
+      Phi->getType());
 }
 
 bool RecurrenceDescriptor::AddReductionVar(
@@ -813,6 +812,7 @@ bool RecurrenceDescriptor::AddReductionVar(
       RecurrenceDescriptor(RdxStart, ExitInstruction, IntermediateStore, Kind,
                            FMF, ExactFPMathInst, RecurrenceType, IsSigned,
                            IsOrdered, CastInsts, MinWidthCastToRecurrenceType);
+
   return true;
 }
 
@@ -1168,7 +1168,7 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
   if (RD.getRecurrenceKind() != RecurKind::None) {
     assert(
         RecurrenceDescriptor::isMinMaxRecurrenceKind(RD.getRecurrenceKind()) &&
-        "must return a min/max recurrence kind");
+        "Expected a min/max recurrence kind");
     LLVM_DEBUG(dbgs() << "Found a min/max reduction PHI." << *Phi << "\n");
     RedDes = RD;
     return true;

>From 8caf01f35a46f449c7928bbebc211ec8b224b8a1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 13 Nov 2025 16:34:45 +0000
Subject: [PATCH 6/8] !fixup remove left-over SmallPtrSet, add comment

---
 llvm/lib/Analysis/IVDescriptors.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 9684e8ea161ab..cd3b0721f9523 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -414,6 +414,9 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
         return {};
       IncOut++;
     } else if (auto *SI = dyn_cast<StoreInst>(User)) {
+      // This check matches LVerLegality::isInvariantAddressOfReduction and
+      // enables vectorizing reductions with stores to invariant addresses in
+      // the loop, by sinking them outside the loop.
       const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
       if (U.getOperandNo() == SI->getPointerOperandIndex() ||
           !SE->isLoopInvariant(Ptr, TheLoop) ||
@@ -435,7 +438,6 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
         any_of(Op->users(), [&Chain](User *U) { return !Chain.contains(U); }))
       return {};
 
-  SmallPtrSet<Instruction *, 4> Casts;
   return RecurrenceDescriptor(
       Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),
       cast<Instruction>(RdxNext), IntermediateStore, RK, FMF, nullptr,

>From fe18a4e40ae70ccb55616ecfb03785e5c06410e2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 14 Nov 2025 15:41:48 +0000
Subject: [PATCH 7/8] !fixup handle all intermediate store cases.

---
 llvm/lib/Analysis/IVDescriptors.cpp | 74 ++++++++++++++++-------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index cd3b0721f9523..9142e64fdafcc 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -400,43 +400,53 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
       return {};
     WorkList.push_back(AMatches ? A : B);
   }
-
-  // Check users of RdxNext. It can have
-  // * a single user outside the loop,
-  // * used stores to the same invariant address,
-  // * used by the starting recurrence phis.
-  unsigned IncOut = 0;
+  // Validate uses: in-chain, stores to same invariant address, intermediate
+  // min/max, or single out-of-loop use of RdxNext.
   StoreInst *IntermediateStore = nullptr;
-  for (Use &U : RdxNext->uses()) {
-    auto *User = cast<Instruction>(U.getUser());
-    if (!TheLoop->contains(User->getParent())) {
-      if (IncOut > 0)
+  const SCEV *StorePtrSCEV = nullptr;
+  SmallPtrSet<Instruction *, 8> IntermediateMinMax;
+  unsigned OutOfLoopUses = 0;
+
+  for (Value *V : Chain) {
+    for (User *U : V->users()) {
+      if (Chain.contains(U) || (U == Phi && V == RdxNext))
+        continue;
+      auto *I = dyn_cast<Instruction>(U);
+      if (!I ||
+          (!TheLoop->contains(I) && (V != RdxNext || ++OutOfLoopUses > 1)))
         return {};
-      IncOut++;
-    } else if (auto *SI = dyn_cast<StoreInst>(User)) {
-      // This check matches LVerLegality::isInvariantAddressOfReduction and
-      // enables vectorizing reductions with stores to invariant addresses in
-      // the loop, by sinking them outside the loop.
-      const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
-      if (U.getOperandNo() == SI->getPointerOperandIndex() ||
-          !SE->isLoopInvariant(Ptr, TheLoop) ||
-          (IntermediateStore &&
-           SE->getSCEV(IntermediateStore->getPointerOperand()) != Ptr))
+      if (!TheLoop->contains(I))
+        continue;
+      if (auto *SI = dyn_cast<StoreInst>(I)) {
+        const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
+        if (!SE->isLoopInvariant(Ptr, TheLoop) ||
+            (StorePtrSCEV && StorePtrSCEV != Ptr))
+          return {};
+        StorePtrSCEV = StorePtrSCEV ? StorePtrSCEV : Ptr;
+        IntermediateStore =
+            !IntermediateStore || IntermediateStore->comesBefore(SI)
+                ? SI
+                : IntermediateStore;
+        continue;
+      }
+      Value *A, *B;
+      if (GetMinMaxRK(I, A, B) != RK)
         return {};
-      // Keep the store that appears last in the block, as it will be the final
-      // reduction value.
-      if (!IntermediateStore || IntermediateStore->comesBefore(SI))
-        IntermediateStore = SI;
-    } else if (Phi != User)
-      return {};
+      IntermediateMinMax.insert(I);
+    }
   }
 
-  // All ops on the chain from Phi to RdxNext must only be used by instructions
-  // in the chain.
-  for (Value *Op : Chain)
-    if (Op != RdxNext &&
-        any_of(Op->users(), [&Chain](User *U) { return !Chain.contains(U); }))
-      return {};
+  if (!IntermediateMinMax.empty() && !IntermediateStore)
+    return {};
+  for (Instruction *I : IntermediateMinMax)
+    for (User *U : I->users())
+      if (!Chain.contains(U) &&
+          (!isa<StoreInst>(U) ||
+           !SE->isLoopInvariant(
+               SE->getSCEV(cast<StoreInst>(U)->getPointerOperand()), TheLoop) ||
+           StorePtrSCEV !=
+               SE->getSCEV(cast<StoreInst>(U)->getPointerOperand())))
+        return {};
 
   return RecurrenceDescriptor(
       Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),

>From 6e26a6e2288706eacb7a234c9df1ab89ac56b49d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 16 Nov 2025 21:02:48 +0000
Subject: [PATCH 8/8] !fixup simplify after merge.

---
 llvm/include/llvm/Analysis/IVDescriptors.h    |  11 +-
 llvm/lib/Analysis/IVDescriptors.cpp           | 168 +++++++-----------
 .../partial-reduce-dot-product-neon.ll        |  39 ++--
 3 files changed, 93 insertions(+), 125 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index b477915bbffb1..4f742e8517ce2 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -105,13 +105,20 @@ class RecurrenceDescriptor {
         IsSigned(Signed), IsOrdered(Ordered),
         PhiHasUsesOutsideReductionChain(PhiHasUsesOutsideReductionChain),
         MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
-    if (CI)
-      CastInsts.insert_range(*CI);
+    CastInsts.insert_range(CI);
     assert(
         (!PhiHasUsesOutsideReductionChain || isMinMaxRecurrenceKind(K)) &&
         "Only min/max recurrences are allowed to have multiple uses currently");
   }
 
+  /// Simpler constructor for min/max recurrences that don't track cast
+  /// instructions.
+  RecurrenceDescriptor(Value *Start, Instruction *Exit, StoreInst *Store,
+                       RecurKind K, FastMathFlags FMF, Instruction *ExactFP,
+                       Type *RT)
+      : IntermediateStore(Store), StartValue(Start), LoopExitInstr(Exit),
+        Kind(K), FMF(FMF), ExactFPMathInst(ExactFP), RecurrenceType(RT) {}
+
   /// This POD struct holds information about a potential recurrence operation.
   class InstDesc {
   public:
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 9142e64fdafcc..f8f609f09b1a0 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -218,60 +218,13 @@ static bool checkOrderedReduction(RecurKind Kind, Instruction *ExactFPMathInst,
   return true;
 }
 
-/// Returns true if \p Phi is a min/max reduction matching \p Kind where \p Phi
-/// is used outside the reduction chain. This is common for loops selecting the
-/// index of a minimum/maximum value (argmin/argmax).
-static bool isMinMaxReductionPhiWithUsersOutsideReductionChain(
-    PHINode *Phi, RecurKind Kind, Loop *TheLoop, RecurrenceDescriptor &RedDes) {
-  BasicBlock *Latch = TheLoop->getLoopLatch();
-  if (!Latch)
-    return false;
-
-  assert(Phi->getNumIncomingValues() == 2 && "phi must have 2 incoming values");
-  Value *Inc = Phi->getIncomingValueForBlock(Latch);
-  if (Phi->hasOneUse() || !Inc->hasOneUse() ||
-      !RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
-    return false;
-
-  Value *A, *B;
-  bool IsMinMax = [&]() {
-    switch (Kind) {
-    case RecurKind::UMax:
-      return match(Inc, m_UMax(m_Value(A), m_Value(B)));
-    case RecurKind::UMin:
-      return match(Inc, m_UMin(m_Value(A), m_Value(B)));
-    case RecurKind::SMax:
-      return match(Inc, m_SMax(m_Value(A), m_Value(B)));
-    case RecurKind::SMin:
-      return match(Inc, m_SMin(m_Value(A), m_Value(B)));
-    default:
-      llvm_unreachable("all min/max kinds must be handled");
-    }
-  }();
-  if (!IsMinMax)
-    return false;
-
-  if (A == B || (A != Phi && B != Phi))
-    return false;
-
-  SmallPtrSet<Instruction *, 4> CastInsts;
-  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
-  RedDes =
-      RecurrenceDescriptor(RdxStart, /*Exit=*/nullptr, /*Store=*/nullptr, Kind,
-                           FastMathFlags(), /*ExactFP=*/nullptr, Phi->getType(),
-                           /*Signed=*/false, /*Ordered=*/false, CastInsts,
-                           /*MinWidthCastToRecurTy=*/-1U, /*PhiMultiUse=*/true);
-  return true;
-}
-
-// Helper to collect FMF from a value and its associated fcmp in select patterns
+// Collect FMF from a value and its associated fcmp in select patterns
 static FastMathFlags collectMinMaxFMF(Value *V) {
   FastMathFlags FMF = cast<FPMathOperator>(V)->getFastMathFlags();
   if (auto *Sel = dyn_cast<SelectInst>(V)) {
-    // Accept FMF on either fcmp or select of a min/max idiom.
-    // TODO: This is a hack to work-around the fact that FMF may not be
-    //       assigned/propagated correctly. If that problem is fixed or we
-    //       standardize on fmin/fmax via intrinsics, this can be removed.
+    // Accept FMF from either fcmp or select in a min/max idiom.
+    // TODO: Remove this when FMF propagation is fixed or we standardize on
+    // intrinsics.
     if (auto *FCmp = dyn_cast<FCmpInst>(Sel->getCondition()))
       FMF |= FCmp->getFastMathFlags();
   }
@@ -312,13 +265,11 @@ hasRequiredFastMathFlags(FPMathOperator *FPOp, RecurKind &RK,
 static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
                                                 FastMathFlags FuncFMF,
                                                 ScalarEvolution *SE) {
-  if (Phi->getNumIncomingValues() != 2 ||
-      Phi->getParent() != TheLoop->getHeader())
-    return {};
-
   Type *Ty = Phi->getType();
   BasicBlock *Latch = TheLoop->getLoopLatch();
-  if ((!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || !Latch)
+  if (Phi->getNumIncomingValues() != 2 ||
+      Phi->getParent() != TheLoop->getHeader() ||
+      (!Ty->isIntegerTy() && !Ty->isFloatingPointTy()) || !Latch)
     return {};
 
   auto GetMinMaxRK = [](Value *V, Value *&A, Value *&B) -> RecurKind {
@@ -348,11 +299,11 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
   };
 
   FastMathFlags FMF = FastMathFlags::getFast();
-  Value *RdxNext = Phi->getIncomingValueForBlock(Latch);
+  Value *BackedgeValue = Phi->getIncomingValueForBlock(Latch);
   RecurKind RK = RecurKind::None;
-  // Identify min/max recurrences by walking the def-use chains upwards,
-  // starting at RdxNext.
-  SmallVector<Value *> WorkList({RdxNext});
+  // Walk def-use chains upwards from BackedgeValue to identify min/max
+  // recurrences.
+  SmallVector<Value *> WorkList({BackedgeValue});
   SmallPtrSet<Value *, 8> Chain({Phi});
   while (!WorkList.empty()) {
     Value *Cur = WorkList.pop_back_val();
@@ -362,8 +313,7 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
     if (!I || !TheLoop->contains(I))
       return {};
     if (auto *PN = dyn_cast<PHINode>(I)) {
-      if (PN != Phi)
-        append_range(WorkList, PN->operands());
+      append_range(WorkList, PN->operands());
       continue;
     }
     Value *A, *B;
@@ -372,8 +322,7 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
       return {};
 
     RK = CurRK;
-    // For floating point recurrences, check we have the required fast-math
-    // flags.
+    // Check required fast-math flags for FP recurrences.
     if (RecurrenceDescriptor::isFPMinMaxRecurrenceKind(CurRK)) {
       auto CurFMF =
           hasRequiredFastMathFlags(cast<FPMathOperator>(Cur), RK, FuncFMF);
@@ -382,15 +331,14 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
       FMF &= *CurFMF;
     }
 
-    Chain.insert(I);
     if (auto *SI = dyn_cast<SelectInst>(I))
       Chain.insert(SI->getCondition());
 
     if (A == Phi || B == Phi)
       continue;
 
-    // Add operand to worklist if it matches the pattern - exactly one must
-    // match
+    // Add operand to worklist if it matches the pattern (exactly one must
+    // match)
     Value *X, *Y;
     auto *IA = dyn_cast<Instruction>(A);
     auto *IB = dyn_cast<Instruction>(B);
@@ -400,57 +348,76 @@ static RecurrenceDescriptor getMinMaxRecurrence(PHINode *Phi, Loop *TheLoop,
       return {};
     WorkList.push_back(AMatches ? A : B);
   }
-  // Validate uses: in-chain, stores to same invariant address, intermediate
-  // min/max, or single out-of-loop use of RdxNext.
-  StoreInst *IntermediateStore = nullptr;
-  const SCEV *StorePtrSCEV = nullptr;
-  SmallPtrSet<Instruction *, 8> IntermediateMinMax;
-  unsigned OutOfLoopUses = 0;
 
+  // Handle argmin/argmax pattern: PHI has uses outside the reduction chain
+  // that are not intermediate min/max operations (which are handled below).
+  // Requires integer min/max, and single-use BackedgeValue (so vectorizer can
+  // handle both PHIs together).
+  bool PhiHasInvalidUses = any_of(Phi->users(), [&](User *U) {
+    Value *A, *B;
+    return !Chain.contains(U) && TheLoop->contains(cast<Instruction>(U)) &&
+           GetMinMaxRK(U, A, B) == RecurKind::None;
+  });
+  if (PhiHasInvalidUses) {
+    if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RK) ||
+        !BackedgeValue->hasOneUse())
+      return {};
+    SmallPtrSet<Instruction *, 4> CastInsts;
+    return RecurrenceDescriptor(
+        Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),
+        /*Exit=*/nullptr, /*Store=*/nullptr, RK, FastMathFlags(),
+        /*ExactFP=*/nullptr, Phi->getType(), /*Signed=*/false,
+        /*Ordered=*/false, CastInsts, /*MinWidthCastToRecurTy=*/-1U,
+        /*PhiMultiUse=*/true);
+  }
+
+  // Validate chain entries and collect stores from chain entries and
+  // intermediate ops.
+  SmallVector<StoreInst *> Stores;
+  unsigned OutOfLoopUses = 0;
   for (Value *V : Chain) {
     for (User *U : V->users()) {
-      if (Chain.contains(U) || (U == Phi && V == RdxNext))
+      if (Chain.contains(U))
         continue;
       auto *I = dyn_cast<Instruction>(U);
-      if (!I ||
-          (!TheLoop->contains(I) && (V != RdxNext || ++OutOfLoopUses > 1)))
+      if (!I || (!TheLoop->contains(I) &&
+                 (V != BackedgeValue || ++OutOfLoopUses > 1)))
         return {};
       if (!TheLoop->contains(I))
         continue;
       if (auto *SI = dyn_cast<StoreInst>(I)) {
-        const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
-        if (!SE->isLoopInvariant(Ptr, TheLoop) ||
-            (StorePtrSCEV && StorePtrSCEV != Ptr))
-          return {};
-        StorePtrSCEV = StorePtrSCEV ? StorePtrSCEV : Ptr;
-        IntermediateStore =
-            !IntermediateStore || IntermediateStore->comesBefore(SI)
-                ? SI
-                : IntermediateStore;
+        Stores.push_back(SI);
         continue;
       }
+      // Must be intermediate min/max of the same kind.
       Value *A, *B;
       if (GetMinMaxRK(I, A, B) != RK)
         return {};
-      IntermediateMinMax.insert(I);
+      for (User *IU : I->users()) {
+        if (auto *SI = dyn_cast<StoreInst>(IU))
+          Stores.push_back(SI);
+        else if (!Chain.contains(IU))
+          return {};
+      }
     }
   }
 
-  if (!IntermediateMinMax.empty() && !IntermediateStore)
-    return {};
-  for (Instruction *I : IntermediateMinMax)
-    for (User *U : I->users())
-      if (!Chain.contains(U) &&
-          (!isa<StoreInst>(U) ||
-           !SE->isLoopInvariant(
-               SE->getSCEV(cast<StoreInst>(U)->getPointerOperand()), TheLoop) ||
-           StorePtrSCEV !=
-               SE->getSCEV(cast<StoreInst>(U)->getPointerOperand())))
-        return {};
+  // Validate all stores go to same invariant address.
+  StoreInst *IntermediateStore = nullptr;
+  const SCEV *StorePtrSCEV = nullptr;
+  for (StoreInst *SI : Stores) {
+    const SCEV *Ptr = SE->getSCEV(SI->getPointerOperand());
+    if (!SE->isLoopInvariant(Ptr, TheLoop) ||
+        (StorePtrSCEV && StorePtrSCEV != Ptr))
+      return {};
+    StorePtrSCEV = Ptr;
+    if (!IntermediateStore || IntermediateStore->comesBefore(SI))
+      IntermediateStore = SI;
+  }
 
   return RecurrenceDescriptor(
       Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()),
-      cast<Instruction>(RdxNext), IntermediateStore, RK, FMF, nullptr,
+      cast<Instruction>(BackedgeValue), IntermediateStore, RK, FMF, nullptr,
       Phi->getType());
 }
 
@@ -465,11 +432,6 @@ bool RecurrenceDescriptor::AddReductionVar(
   if (Phi->getParent() != TheLoop->getHeader())
     return false;
 
-  // Check for min/max reduction variables that feed other users in the loop.
-  if (isMinMaxReductionPhiWithUsersOutsideReductionChain(Phi, Kind, TheLoop,
-                                                         RedDes))
-    return true;
-
   // Obtain the reduction start value from the value that comes from the loop
   // preheader.
   Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index e6f8439d35779..0d9c9055372dc 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -1889,7 +1889,6 @@ for.exit:                        ; preds = %for.body
   ret i32 %result
 }
 
-<<<<<<< HEAD
 define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_ext_mul(
 ; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] {
@@ -2393,9 +2392,9 @@ loop:
   %load.ext = sext i16 %load to i32
   %load.ext.ext = sext i32 %load.ext to i64
   %ec = icmp eq i64 %iv, %n
-  br i1 %ec, label %exit, label %loop
+  br i1 %ec, label %end, label %loop
 
-for.exit:
+end:
   ret i32 %add
 }
 
@@ -2419,15 +2418,15 @@ define i32 @zext_add_reduc_i8_i32_store_invariant(ptr %a, ptr %dst) {
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META19:![0-9]+]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP3]] = add <16 x i32> [[TMP1]], [[VEC_PHI]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
-; CHECK-INTERLEAVE1-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-INTERLEAVE1-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META23:![0-9]+]], !noalias [[META19]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_EXIT:%.*]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
 ;
@@ -2451,20 +2450,20 @@ define i32 @zext_add_reduc_i8_i32_store_invariant(ptr %a, ptr %dst) {
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1, !alias.scope [[META11]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META19:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1, !alias.scope [[META19]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP6]] = add <16 x i32> [[TMP2]], [[VEC_PHI]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5]] = add <16 x i32> [[TMP3]], [[VEC_PHI2]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
 ; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP5]], [[TMP6]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT:    store i32 [[TMP7]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-INTERLEAVED-NEXT:    store i32 [[TMP7]], ptr [[GEP_DST]], align 4, !alias.scope [[META23:![0-9]+]], !noalias [[META19]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[FOR_EXIT:%.*]]
 ; CHECK-INTERLEAVED:       scalar.ph:
 ;
@@ -2487,26 +2486,26 @@ define i32 @zext_add_reduc_i8_i32_store_invariant(ptr %a, ptr %dst) {
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH1]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META11:![0-9]+]]
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1, !alias.scope [[META19:![0-9]+]]
 ; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP3]] = add <16 x i32> [[TMP1]], [[VEC_PHI]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MAXBW-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]])
-; CHECK-MAXBW-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META11]]
+; CHECK-MAXBW-NEXT:    store i32 [[TMP4]], ptr [[GEP_DST]], align 4, !alias.scope [[META23:![0-9]+]], !noalias [[META19]]
 ; CHECK-MAXBW-NEXT:    br label [[FOR_EXIT:%.*]]
 ; CHECK-MAXBW:       scalar.ph:
 ;
 entry:
   %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
   store i32 0, ptr %gep.dst, align 4
-  br label %for.body
+  br label %loop
 
-for.body:
-  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
-  %accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %accum = phi i32 [ 0, %entry ], [ %add, %loop ]
   %gep.a = getelementptr i8, ptr %a, i64 %iv
   %load.a = load i8, ptr %gep.a, align 1
   %ext.a = zext i8 %load.a to i32
@@ -2514,9 +2513,9 @@ for.body:
   store i32 %add, ptr %gep.dst, align 4
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
-  br i1 %exitcond.not, label %for.exit, label %for.body
+  br i1 %exitcond.not, label %end, label %loop
 
-for.exit:
+end:
   ret i32 %add
 }
 



More information about the llvm-commits mailing list