[llvm] [RFC][llvm] Added llvm.loop.vectorize.reassociate_fpreductions.enable metadata. (PR #141685)

Wed Jun 11 11:48:20 PDT 2025

https://github.com/vzakhari updated https://github.com/llvm/llvm-project/pull/141685

>From 1619ad67557dc0495fdcd2e5b8be84b51b80df91 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Tue, 27 May 2025 15:58:17 -0700
Subject: [PATCH 1/5] [RFC][llvm] Added
 llvm.loop.vectorize.reassociation.enable metadata.

This metadata allows unsafe reassociation of computations during
loop vectorization. For example, it allows vectorizing loops
with floating-point reductions without the need to compile the whole
function/program with -fassociative-math.
---
 llvm/docs/LangRef.rst                         | 16 +++++++
 .../Vectorize/LoopVectorizationLegality.h     | 14 +++++-
 .../Vectorize/LoopVectorizationLegality.cpp   |  8 +++-
 .../LoopVectorize/reduction-reassociate.ll    | 47 +++++++++++++++++++
 4 files changed, 82 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 6a4bf6e594d14..b0f42bafd85c1 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -7593,6 +7593,22 @@ Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving
 multiple iterations of the loop. If ``llvm.loop.interleave.count`` is set to 0
 then the interleave count will be determined automatically.
 
+'``llvm.loop.vectorize.reassociation.enable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata selectively allows or disallows reassociating computations,
+which otherwise may be unsafe to reassociate, during the loop vectorization.
+For example, a floating point ``ADD`` reduction without ``reassoc`` fast-math
+flags may be vectorized provided that this metadata allows it. The first
+operand is the string ``llvm.loop.vectorize.reassociation.enable``
+and the second operand is a bit. If the bit operand value is 1 unsafe
+reassociations aqre enabled. A value of 0 disables unsafe reassociations.
+
+.. code-block:: llvm
+
+   !0 = !{!"llvm.loop.vectorize.reassociation.enable", i1 0}
+   !1 = !{!"llvm.loop.vectorize.reassociation.enable", i1 1}
+
 '``llvm.loop.vectorize.enable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index d654ac3ec9273..fb91eb022daf6 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -64,7 +64,8 @@ class LoopVectorizeHints {
     HK_FORCE,
     HK_ISVECTORIZED,
     HK_PREDICATE,
-    HK_SCALABLE
+    HK_SCALABLE,
+    HK_REASSOCIATE,
   };
 
   /// Hint - associates name and validation with the hint value.
@@ -97,6 +98,10 @@ class LoopVectorizeHints {
   /// Says whether we should use fixed width or scalable vectorization.
   Hint Scalable;
 
+  /// Says whether unsafe reassociation of computations is allowed
+  /// during the loop vectorization.
+  Hint Reassociate;
+
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
 
@@ -162,6 +167,13 @@ class LoopVectorizeHints {
     return (ScalableForceKind)Scalable.Value == SK_FixedWidthOnly;
   }
 
+  enum ForceKind getReassociate() const {
+    if ((ForceKind)Reassociate.Value == FK_Undefined &&
+        hasDisableAllTransformsHint(TheLoop))
+      return FK_Disabled;
+    return (ForceKind)Reassociate.Value;
+  }
+
   /// If hints are provided that force vectorization, use the AlwaysPrint
   /// pass name to force the frontend to print the diagnostic.
   const char *vectorizeAnalysisPassName() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8e09e6f8d4935..ec3194f754664 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -97,6 +97,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
   case HK_ISVECTORIZED:
   case HK_PREDICATE:
   case HK_SCALABLE:
+  case HK_REASSOCIATE:
     return (Val == 0 || Val == 1);
   }
   return false;
@@ -112,6 +113,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
       Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
+      Reassociate("vectorize.reassociation.enable", FK_Undefined,
+                  HK_REASSOCIATE),
       TheLoop(L), ORE(ORE) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
@@ -251,6 +254,7 @@ bool LoopVectorizeHints::allowReordering() const {
   ElementCount EC = getWidth();
   return HintsAllowReordering &&
          (getForce() == LoopVectorizeHints::FK_Enabled ||
+          getReassociate() == LoopVectorizeHints::FK_Enabled ||
           EC.getKnownMinValue() > 1);
 }
 
@@ -300,8 +304,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
     return;
   unsigned Val = C->getZExtValue();
 
-  Hint *Hints[] = {&Width,        &Interleave, &Force,
-                   &IsVectorized, &Predicate,  &Scalable};
+  Hint *Hints[] = {&Width,     &Interleave, &Force,      &IsVectorized,
+                   &Predicate, &Scalable,   &Reassociate};
   for (auto *H : Hints) {
     if (Name == H->Name) {
       if (H->validate(Val))
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
new file mode 100644
index 0000000000000..ffe69596545a9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
@@ -0,0 +1,47 @@
+; Check that the loop with a floating-point reduction is vectorized
+; due to llvm.loop.vectorize.reassociation.enable metadata.
+; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+source_filename = "FIRModule"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
+define void @test_(ptr captures(none) %0, ptr readonly captures(none) %1) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @test_(
+; CHECK-NEXT:    fadd contract <4 x float> {{.*}}
+; CHECK-NEXT:    call contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> {{.*}})
+;
+  %invariant.gep = getelementptr i8, ptr %1, i64 -4
+  %.promoted = load float, ptr %0, align 4
+  br label %3
+
+3:                                                ; preds = %2, %3
+  %indvars.iv = phi i64 [ 1, %2 ], [ %indvars.iv.next, %3 ]
+  %4 = phi float [ %.promoted, %2 ], [ %6, %3 ]
+  %gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  %5 = load float, ptr %gep, align 4
+  %6 = fadd contract float %4, %5
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 1001
+  br i1 %exitcond.not, label %7, label %3, !llvm.loop !2
+
+7:                                                ; preds = %3
+  %.lcssa = phi float [ %6, %3 ]
+  store float %.lcssa, ptr %0, align 4
+  ret void
+}
+
+attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "target-cpu"="x86-64" }
+
+!llvm.ident = !{!0}
+!llvm.module.flags = !{!1}
+
+!0 = !{!"flang version 21.0.0"}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = distinct !{!2, !3}
+!3 = !{!"llvm.loop.vectorize.reassociation.enable", i1 true}
+
+; CHECK-NOT: llvm.loop.vectorize.reassociation.enable
+; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}

>From 9511b6e2e10ce539519e9a7f446ccd0f7dd39d84 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Tue, 27 May 2025 17:41:20 -0700
Subject: [PATCH 2/5] Fixed test.

---
 .../Transforms/LoopVectorize/reduction-reassociate.ll     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
index ffe69596545a9..e35ad858b8d89 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
 define void @test_(ptr captures(none) %0, ptr readonly captures(none) %1) local_unnamed_addr #0 {
 ; CHECK-LABEL: define void @test_(
-; CHECK-NEXT:    fadd contract <4 x float> {{.*}}
-; CHECK-NEXT:    call contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> {{.*}})
+; CHECK:    fadd contract <4 x float> {{.*}}
+; CHECK:    call contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> {{.*}})
 ;
   %invariant.gep = getelementptr i8, ptr %1, i64 -4
   %.promoted = load float, ptr %0, align 4
@@ -43,5 +43,5 @@ attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "ta
 !3 = !{!"llvm.loop.vectorize.reassociation.enable", i1 true}
 
 ; CHECK-NOT: llvm.loop.vectorize.reassociation.enable
-; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: !{!"llvm.loop.unroll.runtime.disable"}

>From 5ba9cbd40cbb8dcd2129060cf171655d9efd1c58 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Mon, 2 Jun 2025 20:02:32 -0700
Subject: [PATCH 3/5] Made metadata specific to FP reductions.

---
 llvm/docs/LangRef.rst                         | 22 +++++----
 .../Vectorize/LoopVectorizationLegality.h     | 16 ++++---
 .../Vectorize/LoopVectorizationLegality.cpp   | 46 ++++++++++++-------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  5 +-
 .../LoopVectorize/reduction-reassociate.ll    |  6 +--
 5 files changed, 57 insertions(+), 38 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b0f42bafd85c1..ed5fc5b6c5769 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -7593,21 +7593,23 @@ Note that setting ``llvm.loop.interleave.count`` to 1 disables interleaving
 multiple iterations of the loop. If ``llvm.loop.interleave.count`` is set to 0
 then the interleave count will be determined automatically.
 
-'``llvm.loop.vectorize.reassociation.enable``' Metadata
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+'``llvm.loop.vectorize.reassociate_fpreductions.enable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-This metadata selectively allows or disallows reassociating computations,
-which otherwise may be unsafe to reassociate, during the loop vectorization.
-For example, a floating point ``ADD`` reduction without ``reassoc`` fast-math
-flags may be vectorized provided that this metadata allows it. The first
-operand is the string ``llvm.loop.vectorize.reassociation.enable``
+This metadata selectively allows or disallows reassociating floating-point
+reductions, which otherwise may be unsafe to reassociate, during the loop
+vectorization. For example, a floating point ``ADD`` reduction without
+``reassoc`` fast-math flags may be vectorized provided that this metadata
+allows it. The first operand is the string
+``llvm.loop.vectorize.reassociate_fpreductions.enable``
 and the second operand is a bit. If the bit operand value is 1 unsafe
-reassociations aqre enabled. A value of 0 disables unsafe reassociations.
+reduction reassociations are enabled. A value of 0 disables unsafe
+reduction reassociations.
 
 .. code-block:: llvm
 
-   !0 = !{!"llvm.loop.vectorize.reassociation.enable", i1 0}
-   !1 = !{!"llvm.loop.vectorize.reassociation.enable", i1 1}
+   !0 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 0}
+   !1 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 1}
 
 '``llvm.loop.vectorize.enable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index fb91eb022daf6..5911501ca2d3e 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -65,7 +65,7 @@ class LoopVectorizeHints {
     HK_ISVECTORIZED,
     HK_PREDICATE,
     HK_SCALABLE,
-    HK_REASSOCIATE,
+    HK_REASSOCIATE_FP_REDUCTIONS,
   };
 
   /// Hint - associates name and validation with the hint value.
@@ -98,9 +98,9 @@ class LoopVectorizeHints {
   /// Says whether we should use fixed width or scalable vectorization.
   Hint Scalable;
 
-  /// Says whether unsafe reassociation of computations is allowed
+  /// Says whether unsafe reassociation of FP reductions is allowed
   /// during the loop vectorization.
-  Hint Reassociate;
+  Hint ReassociateFPReductions;
 
   /// Return the loop metadata prefix.
   static StringRef Prefix() { return "llvm.loop."; }
@@ -167,11 +167,11 @@ class LoopVectorizeHints {
     return (ScalableForceKind)Scalable.Value == SK_FixedWidthOnly;
   }
 
-  enum ForceKind getReassociate() const {
-    if ((ForceKind)Reassociate.Value == FK_Undefined &&
+  enum ForceKind getReassociateFPReductions() const {
+    if ((ForceKind)ReassociateFPReductions.Value == FK_Undefined &&
         hasDisableAllTransformsHint(TheLoop))
       return FK_Disabled;
-    return (ForceKind)Reassociate.Value;
+    return (ForceKind)ReassociateFPReductions.Value;
   }
 
   /// If hints are provided that force vectorization, use the AlwaysPrint
@@ -185,6 +185,10 @@ class LoopVectorizeHints {
   /// error accumulates in the loop.
   bool allowReordering() const;
 
+  /// Returns true iff the loop hints allow reassociating floating-point
+  /// reductions for the purpose of vectorization.
+  bool allowFPReductionReassociation() const;
+
   bool isPotentiallyUnsafe() const {
     // Avoid FP vectorization if the target is unsure about proper support.
     // This may be related to the SIMD unit in the target not handling
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ec3194f754664..dffff6f7278a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -97,7 +97,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
   case HK_ISVECTORIZED:
   case HK_PREDICATE:
   case HK_SCALABLE:
-  case HK_REASSOCIATE:
+  case HK_REASSOCIATE_FP_REDUCTIONS:
     return (Val == 0 || Val == 1);
   }
   return false;
@@ -113,8 +113,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
       Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
-      Reassociate("vectorize.reassociation.enable", FK_Undefined,
-                  HK_REASSOCIATE),
+      ReassociateFPReductions("vectorize.reassociate_fpreductions.enable",
+                              FK_Undefined, HK_REASSOCIATE_FP_REDUCTIONS),
       TheLoop(L), ORE(ORE) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
@@ -254,10 +254,14 @@ bool LoopVectorizeHints::allowReordering() const {
   ElementCount EC = getWidth();
   return HintsAllowReordering &&
          (getForce() == LoopVectorizeHints::FK_Enabled ||
-          getReassociate() == LoopVectorizeHints::FK_Enabled ||
           EC.getKnownMinValue() > 1);
 }
 
+bool LoopVectorizeHints::allowFPReductionReassociation() const {
+  return HintsAllowReordering &&
+         getReassociateFPReductions() == LoopVectorizeHints::FK_Enabled;
+}
+
 void LoopVectorizeHints::getHintsFromMetadata() {
   MDNode *LoopID = TheLoop->getLoopID();
   if (!LoopID)
@@ -304,8 +308,13 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
     return;
   unsigned Val = C->getZExtValue();
 
-  Hint *Hints[] = {&Width,     &Interleave, &Force,      &IsVectorized,
-                   &Predicate, &Scalable,   &Reassociate};
+  Hint *Hints[] = {&Width,
+                   &Interleave,
+                   &Force,
+                   &IsVectorized,
+                   &Predicate,
+                   &Scalable,
+                   &ReassociateFPReductions};
   for (auto *H : Hints) {
     if (Name == H->Name) {
       if (H->validate(Val))
@@ -1315,22 +1324,25 @@ bool LoopVectorizationLegality::canVectorizeFPMath(
     return true;
 
   // If the above is false, we have ExactFPMath & do not allow reordering.
-  // If the EnableStrictReductions flag is set, first check if we have any
-  // Exact FP induction vars, which we cannot vectorize.
-  if (!EnableStrictReductions ||
-      any_of(getInductionVars(), [&](auto &Induction) -> bool {
+  // First check if we have any Exact FP induction vars, which we cannot
+  // vectorize.
+  if (any_of(getInductionVars(), [&](auto &Induction) -> bool {
         InductionDescriptor IndDesc = Induction.second;
         return IndDesc.getExactFPMathInst();
       }))
     return false;
 
-  // We can now only vectorize if all reductions with Exact FP math also
-  // have the isOrdered flag set, which indicates that we can move the
-  // reduction operations in-loop.
-  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
-    const RecurrenceDescriptor &RdxDesc = Reduction.second;
-    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
-  }));
+  // We can now only vectorize if the hints allow reassociating FP
+  // reductions or if every reduction with Exact FP math has the isOrdered
+  // flag set (which indicates that we can move the reduction operations
+  // in-loop) and EnableStrictReductions is set. Reductions without Exact
+  // FP math need no extra checks.
+  return (Hints->allowFPReductionReassociation() ||
+          all_of(getReductionVars(), [&](auto &Reduction) -> bool {
+            const RecurrenceDescriptor &RdxDesc = Reduction.second;
+            return !RdxDesc.hasExactFPMath() ||
+                   (EnableStrictReductions && RdxDesc.isOrdered());
+          }));
 }
 
 bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fc8ebebcf21b7..608715453e40d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1000,9 +1000,10 @@ class LoopVectorizationCostModel {
   /// Returns true if we should use strict in-order reductions for the given
   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
-  /// of FP operations.
+  /// of FP operations or FP reductions.
   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
-    return !Hints->allowReordering() && RdxDesc.isOrdered();
+    return !Hints->allowReordering() &&
+           !Hints->allowFPReductionReassociation() && RdxDesc.isOrdered();
   }
 
   /// \returns The smallest bitwidth each instruction can be represented with.
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
index e35ad858b8d89..08b08d2d405b6 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
@@ -1,5 +1,5 @@
 ; Check that the loop with a floating-point reduction is vectorized
-; due to llvm.loop.vectorize.reassociation.enable metadata.
+; due to llvm.loop.vectorize.reassociate_fpreductions.enable metadata.
 ; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s
 
 source_filename = "FIRModule"
@@ -40,8 +40,8 @@ attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "ta
 !0 = !{!"flang version 21.0.0"}
 !1 = !{i32 2, !"Debug Info Version", i32 3}
 !2 = distinct !{!2, !3}
-!3 = !{!"llvm.loop.vectorize.reassociation.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 true}
 
-; CHECK-NOT: llvm.loop.vectorize.reassociation.enable
+; CHECK-NOT: llvm.loop.vectorize.reassociate_fpreductions.enable
 ; CHECK: !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: !{!"llvm.loop.unroll.runtime.disable"}

>From 91f390e8d82e4e5b8c6667c54f621c7d6c842637 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Tue, 10 Jun 2025 17:44:22 -0700
Subject: [PATCH 4/5] Updated LangRef and the test.

---
 llvm/docs/LangRef.rst                         |  14 +-
 .../LoopVectorize/reduction-reassociate.ll    | 151 ++++++++++++++----
 2 files changed, 130 insertions(+), 35 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index ed5fc5b6c5769..6cd7321d0c4e0 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -7597,7 +7597,7 @@ then the interleave count will be determined automatically.
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This metadata selectively allows or disallows reassociating floating-point
-reductions, which otherwise may be unsafe to reassociate, during the loop
+reductions, which otherwise may be unsafe to reassociate, during loop
 vectorization. For example, a floating point ``ADD`` reduction without
 ``reassoc`` fast-math flags may be vectorized provided that this metadata
 allows it. The first operand is the string
@@ -7606,6 +7606,18 @@ and the second operand is a bit. If the bit operand value is 1 unsafe
 reduction reassociations are enabled. A value of 0 disables unsafe
 reduction reassociations.
 
+Note that the reassociation of floating point reductions that is allowed
+by other means is considered safe, so this metadata is a no-op
+in such cases.
+
+For example, reassociation of a floating-point reduction in a loop
+with ``!{!"llvm.loop.vectorize.enable", i1 1}`` metadata is allowed
+regardless of the value of
+``llvm.loop.vectorize.reassociate_fpreductions.enable``.
+
+Similarly, reassociation is always allowed for reduction operations
+with ``reassoc`` fast-math flags.
+
 .. code-block:: llvm
 
    !0 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 0}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
index 08b08d2d405b6..1e760c841f3dd 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
@@ -1,47 +1,130 @@
-; Check that the loop with a floating-point reduction is vectorized
-; due to llvm.loop.vectorize.reassociate_fpreductions.enable metadata.
-; RUN: opt -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; Check that the loops with a floating-point reduction are vectorized
+; according to llvm.loop.vectorize.reassociate_fpreductions.enable metadata.
+; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
 
-source_filename = "FIRModule"
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite)
-define void @test_(ptr captures(none) %0, ptr readonly captures(none) %1) local_unnamed_addr #0 {
-; CHECK-LABEL: define void @test_(
+define float @test_enable(ptr readonly captures(none) %array, float %init) {
+; CHECK-LABEL: define float @test_enable(
 ; CHECK:    fadd contract <4 x float> {{.*}}
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD0:[0-9]+]]
 ; CHECK:    call contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> {{.*}})
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD3:[0-9]+]]
 ;
-  %invariant.gep = getelementptr i8, ptr %1, i64 -4
-  %.promoted = load float, ptr %0, align 4
-  br label %3
-
-3:                                                ; preds = %2, %3
-  %indvars.iv = phi i64 [ 1, %2 ], [ %indvars.iv.next, %3 ]
-  %4 = phi float [ %.promoted, %2 ], [ %6, %3 ]
-  %gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
-  %5 = load float, ptr %gep, align 4
-  %6 = fadd contract float %4, %5
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, 1001
-  br i1 %exitcond.not, label %7, label %3, !llvm.loop !2
-
-7:                                                ; preds = %3
-  %.lcssa = phi float [ %6, %3 ]
-  store float %.lcssa, ptr %0, align 4
-  ret void
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi float [ %init, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr float, ptr %array, i64 %iv
+  %element = load float, ptr %gep, align 4
+  %red.next = fadd contract float %red, %element
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  %result = phi float [ %red.next, %loop ]
+  ret float %result
 }
 
-attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) "target-cpu"="x86-64" }
+; The reduction is unsafe, and the metadata does not allow
+; vectorizing it:
+define float @test_disable(ptr readonly captures(none) %array, float %init) {
+; CHECK-LABEL: define float @test_disable(
+; CHECK-NOT:    <4 x float>
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD4:[0-9]+]]
+;
+entry:
+  br label %loop
 
-!llvm.ident = !{!0}
-!llvm.module.flags = !{!1}
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi float [ %init, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr float, ptr %array, i64 %iv
+  %element = load float, ptr %gep, align 4
+  %red.next = fadd contract float %red, %element
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
+
+exit:
+  %result = phi float [ %red.next, %loop ]
+  ret float %result
+}
+
+; Forced vectorization "makes" the reduction reassociation safe,
+; so setting llvm.loop.vectorize.reassociate_fpreductions.enable
+; to false has no effect:
+define float @test_disable_with_forced_vectorization(ptr readonly captures(none) %array, float %init) {
+; CHECK-LABEL: define float @test_disable_with_forced_vectorization(
+; CHECK:    fadd contract <4 x float> {{.*}}
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD6:[0-9]+]]
+; CHECK:    call contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> {{.*}})
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD7:[0-9]+]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi float [ %init, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr float, ptr %array, i64 %iv
+  %element = load float, ptr %gep, align 4
+  %red.next = fadd contract float %red, %element
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !4
+
+exit:
+  %result = phi float [ %red.next, %loop ]
+  ret float %result
+}
+
+; 'fast' math makes reduction reassociation safe,
+; so setting llvm.loop.vectorize.reassociate_fpreductions.enable
+; to false has no effect:
+define float @test_disable_with_fast_math(ptr readonly captures(none) %array, float %init) {
+; CHECK-LABEL: define float @test_disable_with_fast_math(
+; CHECK:    fadd fast <4 x float> {{.*}}
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD8:[0-9]+]]
+; CHECK:    call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> {{.*}})
+; CHECK:    br i1 %{{.*}}, !llvm.loop ![[MD9:[0-9]+]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi float [ %init, %entry ], [ %red.next, %loop ]
+  %gep = getelementptr float, ptr %array, i64 %iv
+  %element = load float, ptr %gep, align 4
+  %red.next = fadd fast float %red, %element
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1000
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
+
+exit:
+  %result = phi float [ %red.next, %loop ]
+  ret float %result
+}
 
-!0 = !{!"flang version 21.0.0"}
-!1 = !{i32 2, !"Debug Info Version", i32 3}
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 true}
 !2 = distinct !{!2, !3}
-!3 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 false}
+!4 = distinct !{!4, !3, !5}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ; CHECK-NOT: llvm.loop.vectorize.reassociate_fpreductions.enable
-; CHECK: !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: ![[MD0]] = distinct !{![[MD0]], ![[MD1:[0-9]+]], ![[MD2:[0-9]+]]}
+; CHECK: ![[MD1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: ![[MD2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: ![[MD3]] = distinct !{![[MD3]], ![[MD2]], ![[MD1]]}
+; CHECK: ![[MD4]] = distinct !{![[MD4]], ![[MD5:[0-9]+]]}
+; CHECK: ![[MD5]] = !{!"llvm.loop.vectorize.reassociate_fpreductions.enable", i1 false}
+; CHECK: ![[MD6]] = distinct !{![[MD6]], ![[MD1]], ![[MD2]]}
+; CHECK: ![[MD7]] = distinct !{![[MD7]], ![[MD2]], ![[MD1]]}
+; CHECK: ![[MD8]] = distinct !{![[MD8]], ![[MD1]], ![[MD2]]}
+; CHECK: ![[MD9]] = distinct !{![[MD9]], ![[MD2]], ![[MD1]]}

>From 676dedebdb336661b44b25fce2ba3f587c7eb04d Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 11 Jun 2025 11:47:55 -0700
Subject: [PATCH 5/5] Moved test to X86 dir.

---
 .../Transforms/LoopVectorize/{ => X86}/reduction-reassociate.ll   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/Transforms/LoopVectorize/{ => X86}/reduction-reassociate.ll (100%)

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-reassociate.ll
similarity index 100%
rename from llvm/test/Transforms/LoopVectorize/reduction-reassociate.ll
rename to llvm/test/Transforms/LoopVectorize/X86/reduction-reassociate.ll