[clang] [Clang] Add `__builtin_reduce_[in|any]_order_fadd` for floating-point reductions (PR #176160)

Fri Feb 20 07:24:48 PST 2026

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/176160

>From 2fad251f3919d131495800802ea545e0aa40112b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 14 Jan 2026 17:53:39 +0000
Subject: [PATCH 1/8] [Clang] Add `__builtin_reduce_addf` for ordered/unordered
 fp reductions

This adds `__builtin_reduce_addf` to expose the `llvm.vector.reduce.fadd.*`
intrinsic directly in Clang, for the full range of supported FP types.

Given a floating-point vector `vec` and a scalar floating-point value `acc`:

- `__builtin_reduce_addf(vec)` corresponds to an unordered/fast reduction
  * i.e., the lanes can be summed in any order
- `__builtin_reduce_addf(vec, acc)` corresponds to an ordered reduction
  * i.e., the result is as if an accumulator was initialized with `acc`
    and each lane was added to it in order, starting from lane 0

The `acc` is only used for ordered reductions as the original motivation
for adding the "start_value/acc" in the intrinsic was to distinguish
between ordered/unordered reductions, see: https://reviews.llvm.org/D30086.
---
 clang/docs/LanguageExtensions.rst            |  4 ++
 clang/include/clang/Basic/Builtins.td        |  6 +++
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      |  1 +
 clang/lib/CodeGen/CGBuiltin.cpp              | 22 ++++++++
 clang/lib/Sema/SemaChecking.cpp              | 53 +++++++++++++++++---
 clang/test/CodeGen/builtins-reduction-math.c | 23 +++++++++
 clang/test/Sema/builtins-reduction-math.c    | 17 +++++++
 7 files changed, 119 insertions(+), 7 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 29328355c3e6f..2109679d20dcd 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -946,6 +946,10 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
                                          semantics, see `LangRef
                                          <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
                                          for the comparison.
+ ET __builtin_reduce_addf(VT a)          unordered floating-point add reduction.                                floating point types
+ ET __builtin_reduce_addf(VT a, ET s)    ordered floating-point add reduction, initializing the accumulator     floating point types
+                                         with `(ET)s`, then adding each lane of the `a` in-order, starting from
+                                         lane 0.
 ======================================= ====================================================================== ==================================
 
 *Masked Builtins*
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 78dd26aa2c455..182ec64533f26 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1664,6 +1664,12 @@ def ReduceAdd : Builtin {
   let Prototype = "void(...)";
 }
 
+def ReduceAddf : Builtin {
+  let Spellings = ["__builtin_reduce_addf"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ReduceMul : Builtin {
   let Spellings = ["__builtin_reduce_mul"];
   let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 85c7d2cd5c489..87c57ab0cfbdb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1517,6 +1517,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and:
+  case Builtin::BI__builtin_reduce_addf:
   case Builtin::BI__builtin_reduce_maximum:
   case Builtin::BI__builtin_reduce_minimum:
   case Builtin::BI__builtin_matrix_transpose:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 850cc8d2c4c45..32fe47caaa97e 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4215,6 +4215,28 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_reduce_minimum:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, Intrinsic::vector_reduce_fminimum, "rdx.minimum"));
+  case Builtin::BI__builtin_reduce_addf: {
+    llvm::Value *Vector = EmitScalarExpr(E->getArg(0));
+    llvm::Type *ScalarTy = Vector->getType()->getScalarType();
+    llvm::Value *StartValue = nullptr;
+    if (E->getNumArgs() == 2)
+      StartValue = Builder.CreateFPCast(EmitScalarExpr(E->getArg(1)), ScalarTy);
+    llvm::Value *Args[] = {/*start_value=*/StartValue
+                               ? StartValue
+                               : llvm::ConstantFP::get(ScalarTy, -0.0F),
+                           /*vector=*/Vector};
+    llvm::Function *F =
+        CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
+    llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
+    if (!StartValue) {
+      // No start value means an unordered reduction, which requires the reassoc
+      // FMF flag.
+      llvm::FastMathFlags FMF;
+      FMF.setAllowReassoc();
+      cast<llvm::CallBase>(Reduce)->setFastMathFlags(FMF);
+    }
+    return RValue::get(Reduce);
+  }
 
   case Builtin::BI__builtin_matrix_transpose: {
     auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 45006bfc11644..271515c1241ab 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2818,6 +2818,14 @@ static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
   return S.UsualUnaryFPConversions(Res.get());
 }
 
+static QualType GetVectorElementType(ASTContext &Context, QualType VecTy) {
+  if (const auto *TyA = VecTy->getAs<VectorType>())
+    return TyA->getElementType();
+  if (VecTy->isSizelessVectorType())
+    return VecTy->getSizelessVectorEltType(Context);
+  return QualType();
+}
+
 ExprResult
 Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
                                CallExpr *TheCall) {
@@ -3668,14 +3676,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
       return ExprError();
 
     const Expr *Arg = TheCall->getArg(0);
-    const auto *TyA = Arg->getType()->getAs<VectorType>();
-
-    QualType ElTy;
-    if (TyA)
-      ElTy = TyA->getElementType();
-    else if (Arg->getType()->isSizelessVectorType())
-      ElTy = Arg->getType()->getSizelessVectorEltType(Context);
 
+    QualType ElTy = GetVectorElementType(Context, Arg->getType());
     if (ElTy.isNull() || !ElTy->isIntegerType()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector of */ 4 << /* int */ 1 << /* no fp */ 0
@@ -3687,6 +3689,43 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
+  case Builtin::BI__builtin_reduce_addf: {
+    if (checkArgCountRange(TheCall, 1, 2))
+      return ExprError();
+
+    ExprResult Vec = UsualUnaryConversions(TheCall->getArg(0));
+    if (Vec.isInvalid())
+      return ExprError();
+
+    TheCall->setArg(0, Vec.get());
+
+    QualType ElTy = GetVectorElementType(Context, Vec.get()->getType());
+    if (ElTy.isNull() || !ElTy->isRealFloatingType()) {
+      Diag(Vec.get()->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+          << 1 << /* vector of */ 4 << /* no int */ 0 << /* fp */ 1
+          << Vec.get()->getType();
+      return ExprError();
+    }
+
+    if (TheCall->getNumArgs() == 2) {
+      ExprResult StartValue = UsualUnaryConversions(TheCall->getArg(1));
+      if (StartValue.isInvalid())
+        return ExprError();
+
+      if (!StartValue.get()->getType()->isRealFloatingType()) {
+        Diag(StartValue.get()->getBeginLoc(),
+             diag::err_builtin_invalid_arg_type)
+            << 2 << /* scalar */ 1 << /* no int */ 0 << /* fp */ 1
+            << StartValue.get()->getType();
+        return ExprError();
+      }
+      TheCall->setArg(1, StartValue.get());
+    }
+
+    TheCall->setType(ElTy);
+    break;
+  }
+
   case Builtin::BI__builtin_matrix_transpose:
     return BuiltinMatrixTranspose(TheCall, TheCallResult);
 
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index e12fd729c84c0..bde6e9a4f9868 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -4,6 +4,8 @@
 // RUN: %clang_cc1 -O1 -triple aarch64 -target-feature +sve  %s -emit-llvm -disable-llvm-passes -o - | FileCheck --check-prefixes=SVE   %s
 
 typedef float float4 __attribute__((ext_vector_type(4)));
+typedef _Float16 half8 __attribute__((ext_vector_type(8)));
+
 typedef short int si8 __attribute__((ext_vector_type(8)));
 typedef unsigned int u4 __attribute__((ext_vector_type(4)));
 
@@ -162,6 +164,27 @@ void test_builtin_reduce_minimum(float4 vf1) {
   const double r4 = __builtin_reduce_minimum(vf1_as_one);
 }
 
+void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
+  // CHECK-LABEL: define void @test_builtin_reduce_addf(
+
+  // CHECK:      [[V0:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
+  // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V0]])
+  float r1 = __builtin_reduce_addf(vf1);
+
+  // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V1]])
+  float r2 = __builtin_reduce_addf(vf1, 0.0f);
+
+  // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
+  // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
+  _Float16 r3 = __builtin_reduce_addf(vf2);
+
+  // CHECK:      [[V3:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
+  // CHECK-NEXT: [[RDX:%.+]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V3]])
+  // CHECK-NEXT: fpext half [[RDX]] to float
+  float r4 = __builtin_reduce_addf(vf2, -0.0f);
+}
+
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 74f09d501198b..d4562d967e0e9 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -148,3 +148,20 @@ void test_builtin_reduce_minimum(int i, float4 v, int3 iv) {
   i = __builtin_reduce_minimum(i);
   // expected-error at -1 {{1st argument must be a vector of floating-point types (was 'int')}}
 }
+
+void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
+  struct Foo s = __builtin_reduce_addf(v);
+  // expected-error at -1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
+
+  f = __builtin_reduce_addf();
+  // expected-error at -1 {{too few arguments to function call, expected 1, have 0}}
+
+  f = __builtin_reduce_addf(v, f, v);
+  // expected-error at -1 {{too many arguments to function call, expected at most 2, have 3}}
+
+  f = __builtin_reduce_addf(iv);
+  // expected-error at -1 {{1st argument must be a vector of floating-point types (was 'int3' (vector of 3 'int' values))}}
+
+  f = __builtin_reduce_addf(v, (int)121);
+  // expected-error at -1 {{2nd argument must be a scalar floating-point type (was 'int')}}
+}

>From 2b99bdeb2b5e3ccdacfed2098df10f625e0a9fa3 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 15 Jan 2026 13:37:34 +0000
Subject: [PATCH 2/8] Try to fix docs

---
 clang/docs/LanguageExtensions.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 2109679d20dcd..d2541be502862 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -948,8 +948,8 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
                                          for the comparison.
  ET __builtin_reduce_addf(VT a)          unordered floating-point add reduction.                                floating point types
  ET __builtin_reduce_addf(VT a, ET s)    ordered floating-point add reduction, initializing the accumulator     floating point types
-                                         with `(ET)s`, then adding each lane of the `a` in-order, starting from
-                                         lane 0.
+                                         with `(ET)s`, then adding each lane of the `a` in-order, starting
+                                         from lane 0.
 ======================================= ====================================================================== ==================================
 
 *Masked Builtins*

>From 4b0f3389159acca697538325fe017fccaed2cdef Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 26 Jan 2026 13:58:12 +0000
Subject: [PATCH 3/8] Fixups

---
 clang/docs/LanguageExtensions.rst            | 50 ++++++++++----------
 clang/include/clang/Basic/Builtins.td        |  6 +++
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      |  1 +
 clang/lib/CodeGen/CGBuiltin.cpp              |  9 ++--
 clang/lib/Sema/SemaChecking.cpp              |  9 ++--
 clang/test/CodeGen/builtins-reduction-math.c | 10 ++--
 clang/test/Sema/builtins-reduction-math.c    |  8 ++--
 7 files changed, 51 insertions(+), 42 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index d2541be502862..7322f69753809 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -926,31 +926,31 @@ Example:
 
 Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
 
-======================================= ====================================================================== ==================================
-         Name                            Operation                                                              Supported element types
-======================================= ====================================================================== ==================================
- ET __builtin_reduce_max(VT a)           return the largest element of the vector. The floating point result    integer and floating point types
-                                         will always be a number unless all elements of the vector are NaN.
- ET __builtin_reduce_min(VT a)           return the smallest element of the vector. The floating point result   integer and floating point types
-                                         will always be a number unless all elements of the vector are NaN.
- ET __builtin_reduce_add(VT a)           \+                                                                     integer types
- ET __builtin_reduce_mul(VT a)           \*                                                                     integer types
- ET __builtin_reduce_and(VT a)           &                                                                      integer types
- ET __builtin_reduce_or(VT a)            \|                                                                     integer types
- ET __builtin_reduce_xor(VT a)           ^                                                                      integer types
- ET __builtin_reduce_maximum(VT a)       return the largest element of the vector. Follows IEEE 754-2019        floating point types
-                                         semantics, see `LangRef
-                                         <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
-                                         for the comparison.
- ET __builtin_reduce_minimum(VT a)       return the smallest element of the vector. Follows IEEE 754-2019       floating point types
-                                         semantics, see `LangRef
-                                         <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
-                                         for the comparison.
- ET __builtin_reduce_addf(VT a)          unordered floating-point add reduction.                                floating point types
- ET __builtin_reduce_addf(VT a, ET s)    ordered floating-point add reduction, initializing the accumulator     floating point types
-                                         with `(ET)s`, then adding each lane of the `a` in-order, starting
-                                         from lane 0.
-======================================= ====================================================================== ==================================
+============================================= ====================================================================== ==================================
+         Name                                 Operation                                                              Supported element types
+============================================= ====================================================================== ==================================
+ ET __builtin_reduce_max(VT a)                 return the largest element of the vector. The floating point result    integer and floating point types
+                                               will always be a number unless all elements of the vector are NaN.
+ ET __builtin_reduce_min(VT a)                 return the smallest element of the vector. The floating point result   integer and floating point types
+                                               will always be a number unless all elements of the vector are NaN.
+ ET __builtin_reduce_add(VT a)                 \+                                                                     integer types
+ ET __builtin_reduce_mul(VT a)                 \*                                                                     integer types
+ ET __builtin_reduce_and(VT a)                 &                                                                      integer types
+ ET __builtin_reduce_or(VT a)                  \|                                                                     integer types
+ ET __builtin_reduce_xor(VT a)                 ^                                                                      integer types
+ ET __builtin_reduce_maximum(VT a)             return the largest element of the vector. Follows IEEE 754-2019        floating point types
+                                               semantics, see `LangRef
+                                               <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
+                                               for the comparison.
+ ET __builtin_reduce_minimum(VT a)             return the smallest element of the vector. Follows IEEE 754-2019       floating point types
+                                               semantics, see `LangRef
+                                               <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
+                                               for the comparison.
+ ET __builtin_reduce_addf(VT a)                unordered floating-point add reduction.                                floating point types
+ ET __builtin_ordered_reduce_addf(VT a, ET s)  ordered floating-point add reduction, initializing the accumulator     floating point types
+                                               with `(ET)s`, then adding each lane of the `a` in-order, starting
+                                               from lane 0.
+============================================= ====================================================================== ==================================
 
 *Masked Builtins*
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 182ec64533f26..59b76e349bb71 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1670,6 +1670,12 @@ def ReduceAddf : Builtin {
   let Prototype = "void(...)";
 }
 
+def OrderedReduceAddf : Builtin {
+  let Spellings = ["__builtin_ordered_reduce_addf"];
+  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Prototype = "void(...)";
+}
+
 def ReduceMul : Builtin {
   let Spellings = ["__builtin_reduce_mul"];
   let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 87c57ab0cfbdb..70f9ba5edb783 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1518,6 +1518,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and:
   case Builtin::BI__builtin_reduce_addf:
+  case Builtin::BI__builtin_ordered_reduce_addf:
   case Builtin::BI__builtin_reduce_maximum:
   case Builtin::BI__builtin_reduce_minimum:
   case Builtin::BI__builtin_matrix_transpose:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 32fe47caaa97e..711c9754de76f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4215,7 +4215,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_reduce_minimum:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, Intrinsic::vector_reduce_fminimum, "rdx.minimum"));
-  case Builtin::BI__builtin_reduce_addf: {
+  case Builtin::BI__builtin_reduce_addf:
+  case Builtin::BI__builtin_ordered_reduce_addf: {
     llvm::Value *Vector = EmitScalarExpr(E->getArg(0));
     llvm::Type *ScalarTy = Vector->getType()->getScalarType();
     llvm::Value *StartValue = nullptr;
@@ -4228,9 +4229,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
     llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
-    if (!StartValue) {
-      // No start value means an unordered reduction, which requires the reassoc
-      // FMF flag.
+    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_addf) {
+      // `__builtin_reduce_addf` an unordered reduction, which requires the
+      // reassoc FMF flag.
       llvm::FastMathFlags FMF;
       FMF.setAllowReassoc();
       cast<llvm::CallBase>(Reduce)->setFastMathFlags(FMF);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 271515c1241ab..84bccd20c7765 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2818,7 +2818,7 @@ static ExprResult BuiltinVectorMathConversions(Sema &S, Expr *E) {
   return S.UsualUnaryFPConversions(Res.get());
 }
 
-static QualType GetVectorElementType(ASTContext &Context, QualType VecTy) {
+static QualType getVectorElementType(ASTContext &Context, QualType VecTy) {
   if (const auto *TyA = VecTy->getAs<VectorType>())
     return TyA->getElementType();
   if (VecTy->isSizelessVectorType())
@@ -3677,7 +3677,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     const Expr *Arg = TheCall->getArg(0);
 
-    QualType ElTy = GetVectorElementType(Context, Arg->getType());
+    QualType ElTy = getVectorElementType(Context, Arg->getType());
     if (ElTy.isNull() || !ElTy->isIntegerType()) {
       Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector of */ 4 << /* int */ 1 << /* no fp */ 0
@@ -3689,7 +3689,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
-  case Builtin::BI__builtin_reduce_addf: {
+  case Builtin::BI__builtin_reduce_addf:
+  case Builtin::BI__builtin_ordered_reduce_addf: {
     if (checkArgCountRange(TheCall, 1, 2))
       return ExprError();
 
@@ -3699,7 +3700,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
     TheCall->setArg(0, Vec.get());
 
-    QualType ElTy = GetVectorElementType(Context, Vec.get()->getType());
+    QualType ElTy = getVectorElementType(Context, Vec.get()->getType());
     if (ElTy.isNull() || !ElTy->isRealFloatingType()) {
       Diag(Vec.get()->getBeginLoc(), diag::err_builtin_invalid_arg_type)
           << 1 << /* vector of */ 4 << /* no int */ 0 << /* fp */ 1
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index bde6e9a4f9868..2c69315419882 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -168,12 +168,12 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
   // CHECK-LABEL: define void @test_builtin_reduce_addf(
 
   // CHECK:      [[V0:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
-  // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V0]])
-  float r1 = __builtin_reduce_addf(vf1);
+  // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> [[V0]])
+  float r1 = __builtin_reduce_addf(vf1, 1.0f);
 
   // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V1]])
-  float r2 = __builtin_reduce_addf(vf1, 0.0f);
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V1]])
+  float r2 = __builtin_ordered_reduce_addf(vf1);
 
   // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
@@ -182,7 +182,7 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
   // CHECK:      [[V3:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: [[RDX:%.+]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V3]])
   // CHECK-NEXT: fpext half [[RDX]] to float
-  float r4 = __builtin_reduce_addf(vf2, -0.0f);
+  float r4 = __builtin_ordered_reduce_addf(vf2, -0.0f);
 }
 
 #if defined(__ARM_FEATURE_SVE)
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index d4562d967e0e9..3ca5b5755a53e 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -153,15 +153,15 @@ void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
   struct Foo s = __builtin_reduce_addf(v);
   // expected-error at -1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
 
+  f = __builtin_ordered_reduce_addf(v, f, f);
+  // expected-error at -1 {{too many arguments to function call, expected at most 2, have 3}}
+
   f = __builtin_reduce_addf();
   // expected-error at -1 {{too few arguments to function call, expected 1, have 0}}
 
-  f = __builtin_reduce_addf(v, f, v);
-  // expected-error at -1 {{too many arguments to function call, expected at most 2, have 3}}
-
   f = __builtin_reduce_addf(iv);
   // expected-error at -1 {{1st argument must be a vector of floating-point types (was 'int3' (vector of 3 'int' values))}}
 
-  f = __builtin_reduce_addf(v, (int)121);
+  f = __builtin_ordered_reduce_addf(v, (int)121);
   // expected-error at -1 {{2nd argument must be a scalar floating-point type (was 'int')}}
 }

>From 376e17647da4bb0fd82e422a6b153a94232e1722 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 26 Jan 2026 17:55:15 +0000
Subject: [PATCH 4/8] Rename

---
 clang/docs/LanguageExtensions.rst            | 12 ++++++------
 clang/include/clang/Basic/Builtins.td        |  4 ++--
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      |  4 ++--
 clang/lib/CodeGen/CGBuiltin.cpp              |  8 ++++----
 clang/lib/Sema/SemaChecking.cpp              |  4 ++--
 clang/test/CodeGen/builtins-reduction-math.c |  8 ++++----
 clang/test/Sema/builtins-reduction-math.c    | 10 +++++-----
 7 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 7322f69753809..5a25dcf1f35a0 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -946,8 +946,8 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
                                                semantics, see `LangRef
                                                <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
                                                for the comparison.
- ET __builtin_reduce_addf(VT a)                unordered floating-point add reduction.                                floating point types
- ET __builtin_ordered_reduce_addf(VT a, ET s)  ordered floating-point add reduction, initializing the accumulator     floating point types
+ ET __builtin_reduce_fadd(VT a)                unordered floating-point add reduction.                                floating point types
+ ET __builtin_ordered_reduce_fadd(VT a, ET s)  ordered floating-point add reduction, initializing the accumulator     floating point types
                                                with `(ET)s`, then adding each lane of the `a` in-order, starting
                                                from lane 0.
 ============================================= ====================================================================== ==================================
@@ -979,15 +979,15 @@ Example:
     using v8i = int [[clang::ext_vector_type(8)]];
 
     v8i load(v8b mask, int *ptr) { return __builtin_masked_load(mask, ptr); }
-    
+
     v8i load_expand(v8b mask, int *ptr) {
       return __builtin_masked_expand_load(mask, ptr);
     }
-    
+
     void store(v8b mask, v8i val, int *ptr) {
       __builtin_masked_store(mask, val, ptr);
     }
-    
+
     void store_compress(v8b mask, v8i val, int *ptr) {
       __builtin_masked_compress_store(mask, val, ptr);
     }
@@ -1079,7 +1079,7 @@ The matrix type extension supports explicit casts. Implicit type conversion betw
 
 The matrix type extension supports column and row major memory layouts, but not
 all builtins are supported with row-major layout. The layout defaults to column
-major and can be specified using `-fmatrix-memory-layout`. To enable column 
+major and can be specified using `-fmatrix-memory-layout`. To enable column
 major layout, use `-fmatrix-memory-layout=column-major`, and for row major
 layout use `-fmatrix-memory-layout=row-major`
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 59b76e349bb71..c328d011c05c0 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1665,13 +1665,13 @@ def ReduceAdd : Builtin {
 }
 
 def ReduceAddf : Builtin {
-  let Spellings = ["__builtin_reduce_addf"];
+  let Spellings = ["__builtin_reduce_fadd"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
 }
 
 def OrderedReduceAddf : Builtin {
-  let Spellings = ["__builtin_ordered_reduce_addf"];
+  let Spellings = ["__builtin_ordered_reduce_fadd"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 70f9ba5edb783..ffc9df52c7a8e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1517,8 +1517,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and:
-  case Builtin::BI__builtin_reduce_addf:
-  case Builtin::BI__builtin_ordered_reduce_addf:
+  case Builtin::BI__builtin_reduce_fadd:
+  case Builtin::BI__builtin_ordered_reduce_fadd:
   case Builtin::BI__builtin_reduce_maximum:
   case Builtin::BI__builtin_reduce_minimum:
   case Builtin::BI__builtin_matrix_transpose:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 711c9754de76f..dfd2d5e899d09 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4215,8 +4215,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_reduce_minimum:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, Intrinsic::vector_reduce_fminimum, "rdx.minimum"));
-  case Builtin::BI__builtin_reduce_addf:
-  case Builtin::BI__builtin_ordered_reduce_addf: {
+  case Builtin::BI__builtin_reduce_fadd:
+  case Builtin::BI__builtin_ordered_reduce_fadd: {
     llvm::Value *Vector = EmitScalarExpr(E->getArg(0));
     llvm::Type *ScalarTy = Vector->getType()->getScalarType();
     llvm::Value *StartValue = nullptr;
@@ -4229,8 +4229,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
     llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
-    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_addf) {
-      // `__builtin_reduce_addf` an unordered reduction, which requires the
+    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_fadd) {
+      // `__builtin_reduce_fadd` an unordered reduction, which requires the
       // reassoc FMF flag.
       llvm::FastMathFlags FMF;
       FMF.setAllowReassoc();
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 84bccd20c7765..fc641f0515d5f 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3689,8 +3689,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
-  case Builtin::BI__builtin_reduce_addf:
-  case Builtin::BI__builtin_ordered_reduce_addf: {
+  case Builtin::BI__builtin_reduce_fadd:
+  case Builtin::BI__builtin_ordered_reduce_fadd: {
     if (checkArgCountRange(TheCall, 1, 2))
       return ExprError();
 
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 2c69315419882..99f1596e3ff54 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -169,20 +169,20 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
 
   // CHECK:      [[V0:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> [[V0]])
-  float r1 = __builtin_reduce_addf(vf1, 1.0f);
+  float r1 = __builtin_reduce_fadd(vf1, 1.0f);
 
   // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V1]])
-  float r2 = __builtin_ordered_reduce_addf(vf1);
+  float r2 = __builtin_ordered_reduce_fadd(vf1);
 
   // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
-  _Float16 r3 = __builtin_reduce_addf(vf2);
+  _Float16 r3 = __builtin_reduce_fadd(vf2);
 
   // CHECK:      [[V3:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: [[RDX:%.+]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V3]])
   // CHECK-NEXT: fpext half [[RDX]] to float
-  float r4 = __builtin_ordered_reduce_addf(vf2, -0.0f);
+  float r4 = __builtin_ordered_reduce_fadd(vf2, -0.0f);
 }
 
 #if defined(__ARM_FEATURE_SVE)
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 3ca5b5755a53e..738b54dc929dd 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -150,18 +150,18 @@ void test_builtin_reduce_minimum(int i, float4 v, int3 iv) {
 }
 
 void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
-  struct Foo s = __builtin_reduce_addf(v);
+  struct Foo s = __builtin_reduce_fadd(v);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
 
-  f = __builtin_ordered_reduce_addf(v, f, f);
+  f = __builtin_ordered_reduce_fadd(v, f, f);
   // expected-error@-1 {{too many arguments to function call, expected at most 2, have 3}}
 
-  f = __builtin_reduce_addf();
+  f = __builtin_reduce_fadd();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
 
-  f = __builtin_reduce_addf(iv);
+  f = __builtin_reduce_fadd(iv);
   // expected-error@-1 {{1st argument must be a vector of floating-point types (was 'int3' (vector of 3 'int' values))}}
 
-  f = __builtin_ordered_reduce_addf(v, (int)121);
+  f = __builtin_ordered_reduce_fadd(v, (int)121);
   // expected-error@-1 {{2nd argument must be a scalar floating-point type (was 'int')}}
 }

>From b5c55c2d05669e4a75a32b0a8946c530b92bcdaf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 27 Jan 2026 11:36:18 +0000
Subject: [PATCH 5/8] Fixups

---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index dfd2d5e899d09..c1a50bbe1c963 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4230,7 +4230,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
     llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
     if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_fadd) {
-      // `__builtin_reduce_fadd` an unordered reduction, which requires the
+      // `__builtin_reduce_fadd` is an unordered reduction which requires the
       // reassoc FMF flag.
       llvm::FastMathFlags FMF;
       FMF.setAllowReassoc();

>From 94c546ba73d9645c2483c5a7e0a3371ebe3551a7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 19 Feb 2026 12:58:23 +0000
Subject: [PATCH 6/8] Tweak names

---
 clang/docs/LanguageExtensions.rst            | 50 ++++++++++----------
 clang/include/clang/Basic/Builtins.td        |  8 ++--
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      |  4 +-
 clang/lib/CodeGen/CGBuiltin.cpp              | 10 ++--
 clang/lib/Sema/SemaChecking.cpp              |  4 +-
 clang/test/CodeGen/builtins-reduction-math.c |  8 ++--
 clang/test/Sema/builtins-reduction-math.c    | 10 ++--
 7 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 5a25dcf1f35a0..29eda1121dbcb 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -926,31 +926,31 @@ Example:
 
 Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
 
-============================================= ====================================================================== ==================================
-         Name                                 Operation                                                              Supported element types
-============================================= ====================================================================== ==================================
- ET __builtin_reduce_max(VT a)                 return the largest element of the vector. The floating point result    integer and floating point types
-                                               will always be a number unless all elements of the vector are NaN.
- ET __builtin_reduce_min(VT a)                 return the smallest element of the vector. The floating point result   integer and floating point types
-                                               will always be a number unless all elements of the vector are NaN.
- ET __builtin_reduce_add(VT a)                 \+                                                                     integer types
- ET __builtin_reduce_mul(VT a)                 \*                                                                     integer types
- ET __builtin_reduce_and(VT a)                 &                                                                      integer types
- ET __builtin_reduce_or(VT a)                  \|                                                                     integer types
- ET __builtin_reduce_xor(VT a)                 ^                                                                      integer types
- ET __builtin_reduce_maximum(VT a)             return the largest element of the vector. Follows IEEE 754-2019        floating point types
-                                               semantics, see `LangRef
-                                               <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
-                                               for the comparison.
- ET __builtin_reduce_minimum(VT a)             return the smallest element of the vector. Follows IEEE 754-2019       floating point types
-                                               semantics, see `LangRef
-                                               <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
-                                               for the comparison.
- ET __builtin_reduce_fadd(VT a)                unordered floating-point add reduction.                                floating point types
- ET __builtin_ordered_reduce_fadd(VT a, ET s)  ordered floating-point add reduction, initializing the accumulator     floating point types
-                                               with `(ET)s`, then adding each lane of the `a` in-order, starting
-                                               from lane 0.
-============================================= ====================================================================== ==================================
+============================================== ====================================================================== ==================================
+         Name                                   Operation                                                              Supported element types
+============================================== ====================================================================== ==================================
+ ET __builtin_reduce_max(VT a)                  return the largest element of the vector. The floating point result    integer and floating point types
+                                                will always be a number unless all elements of the vector are NaN.
+ ET __builtin_reduce_min(VT a)                  return the smallest element of the vector. The floating point result   integer and floating point types
+                                                will always be a number unless all elements of the vector are NaN.
+ ET __builtin_reduce_add(VT a)                  \+                                                                     integer types
+ ET __builtin_reduce_mul(VT a)                  \*                                                                     integer types
+ ET __builtin_reduce_and(VT a)                  &                                                                      integer types
+ ET __builtin_reduce_or(VT a)                   \|                                                                     integer types
+ ET __builtin_reduce_xor(VT a)                  ^                                                                      integer types
+ ET __builtin_reduce_maximum(VT a)              return the largest element of the vector. Follows IEEE 754-2019        floating point types
+                                                semantics, see `LangRef
+                                                <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
+                                                for the comparison.
+ ET __builtin_reduce_minimum(VT a)              return the smallest element of the vector. Follows IEEE 754-2019       floating point types
+                                                semantics, see `LangRef
+                                                <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
+                                                for the comparison.
+ ET __builtin_reduce_any_order_fadd(VT a)       floating-point associative fadd reduction.                              floating point types
+ ET __builtin_reduce_in_order_fadd(VT a, ET s)  in order floating-point fadd reduction, initializing the accumulator    floating point types
+                                                with `(ET)s`, then adding each lane of the `a` in-order, starting
+                                                from lane 0.
+============================================== ====================================================================== ==================================
 
 *Masked Builtins*
 
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index c328d011c05c0..44c3d9d808f2b 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1664,14 +1664,14 @@ def ReduceAdd : Builtin {
   let Prototype = "void(...)";
 }
 
-def ReduceAddf : Builtin {
-  let Spellings = ["__builtin_reduce_fadd"];
+def ReduceInOrderFAdd : Builtin {
+  let Spellings = ["__builtin_reduce_in_order_fadd"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
 }
 
-def OrderedReduceAddf : Builtin {
-  let Spellings = ["__builtin_ordered_reduce_fadd"];
+def ReduceAnyOrderFAdd : Builtin {
+  let Spellings = ["__builtin_reduce_any_order_fadd"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index ffc9df52c7a8e..903b52eb58a56 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1517,8 +1517,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and:
-  case Builtin::BI__builtin_reduce_fadd:
-  case Builtin::BI__builtin_ordered_reduce_fadd:
+  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_in_order_fadd:
   case Builtin::BI__builtin_reduce_maximum:
   case Builtin::BI__builtin_reduce_minimum:
   case Builtin::BI__builtin_matrix_transpose:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c1a50bbe1c963..d79b9e8e0dd90 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4215,8 +4215,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_reduce_minimum:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, Intrinsic::vector_reduce_fminimum, "rdx.minimum"));
-  case Builtin::BI__builtin_reduce_fadd:
-  case Builtin::BI__builtin_ordered_reduce_fadd: {
+  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_in_order_fadd: {
     llvm::Value *Vector = EmitScalarExpr(E->getArg(0));
     llvm::Type *ScalarTy = Vector->getType()->getScalarType();
     llvm::Value *StartValue = nullptr;
@@ -4229,9 +4229,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
     llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
-    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_fadd) {
-      // `__builtin_reduce_fadd` is an unordered reduction which requires the
-      // reassoc FMF flag.
+    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_any_order_fadd) {
+      // `__builtin_reduce_any_order_fadd` is an associative reduction which
+      // requires the reassoc FMF flag.
       llvm::FastMathFlags FMF;
       FMF.setAllowReassoc();
       cast<llvm::CallBase>(Reduce)->setFastMathFlags(FMF);
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index fc641f0515d5f..ada9356b78bf9 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3689,8 +3689,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
-  case Builtin::BI__builtin_reduce_fadd:
-  case Builtin::BI__builtin_ordered_reduce_fadd: {
+  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_in_order_fadd: {
     if (checkArgCountRange(TheCall, 1, 2))
       return ExprError();
 
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 99f1596e3ff54..99a2ba110f798 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -169,20 +169,20 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
 
   // CHECK:      [[V0:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> [[V0]])
-  float r1 = __builtin_reduce_fadd(vf1, 1.0f);
+  float r1 = __builtin_reduce_any_order_fadd(vf1, 1.0f);
 
   // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V1]])
-  float r2 = __builtin_ordered_reduce_fadd(vf1);
+  float r2 = __builtin_reduce_in_order_fadd(vf1);
 
   // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
-  _Float16 r3 = __builtin_reduce_fadd(vf2);
+  _Float16 r3 = __builtin_reduce_any_order_fadd(vf2);
 
   // CHECK:      [[V3:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: [[RDX:%.+]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V3]])
   // CHECK-NEXT: fpext half [[RDX]] to float
-  float r4 = __builtin_ordered_reduce_fadd(vf2, -0.0f);
+  float r4 = __builtin_reduce_in_order_fadd(vf2, -0.0f);
 }
 
 #if defined(__ARM_FEATURE_SVE)
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 738b54dc929dd..dc1875096f6db 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -150,18 +150,18 @@ void test_builtin_reduce_minimum(int i, float4 v, int3 iv) {
 }
 
 void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
-  struct Foo s = __builtin_reduce_fadd(v);
+  struct Foo s = __builtin_reduce_any_order_fadd(v);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
 
-  f = __builtin_ordered_reduce_fadd(v, f, f);
+  f = __builtin_reduce_in_order_fadd(v, f, f);
   // expected-error@-1 {{too many arguments to function call, expected at most 2, have 3}}
 
-  f = __builtin_reduce_fadd();
+  f = __builtin_reduce_any_order_fadd();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
 
-  f = __builtin_reduce_fadd(iv);
+  f = __builtin_reduce_any_order_fadd(iv);
   // expected-error@-1 {{1st argument must be a vector of floating-point types (was 'int3' (vector of 3 'int' values))}}
 
-  f = __builtin_ordered_reduce_fadd(v, (int)121);
+  f = __builtin_reduce_in_order_fadd(v, (int)121);
   // expected-error@-1 {{2nd argument must be a scalar floating-point type (was 'int')}}
 }

>From b1a9cf1ce71a29158bd13005a016b3d0c23f3325 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 20 Feb 2026 15:12:09 +0000
Subject: [PATCH 7/8] Fixups

---
 clang/lib/Sema/SemaChecking.cpp              | 4 +++-
 clang/test/CodeGen/builtins-reduction-math.c | 4 ++--
 clang/test/Sema/builtins-reduction-math.c    | 5 ++++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ada9356b78bf9..00a8dad741361 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3691,7 +3691,9 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
 
   case Builtin::BI__builtin_reduce_any_order_fadd:
   case Builtin::BI__builtin_reduce_in_order_fadd: {
-    if (checkArgCountRange(TheCall, 1, 2))
+    // For in-order reductions require the user to specify the start value.
+    bool InOrder = BuiltinID == Builtin::BI__builtin_reduce_in_order_fadd;
+    if (InOrder ? checkArgCount(TheCall, 2) : checkArgCountRange(TheCall, 1, 2))
       return ExprError();
 
     ExprResult Vec = UsualUnaryConversions(TheCall->getArg(0));
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 99a2ba110f798..764160e0fd48d 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -172,8 +172,8 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
   float r1 = __builtin_reduce_any_order_fadd(vf1, 1.0f);
 
   // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
-  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V1]])
-  float r2 = __builtin_reduce_in_order_fadd(vf1);
+  // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 2.000000e+00, <4 x float> [[V1]])
+  float r2 = __builtin_reduce_in_order_fadd(vf1, 2.0f);
 
   // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index dc1875096f6db..0d1cc8cd52b7b 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -153,8 +153,11 @@ void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
   struct Foo s = __builtin_reduce_any_order_fadd(v);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
 
+  f = __builtin_reduce_in_order_fadd(v);
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 1}}
+
   f = __builtin_reduce_in_order_fadd(v, f, f);
-  // expected-error@-1 {{too many arguments to function call, expected at most 2, have 3}}
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
 
   f = __builtin_reduce_any_order_fadd();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}

>From 45f56c91bd7936910f41b7f4f10aeaceebe7b999 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 20 Feb 2026 15:24:07 +0000
Subject: [PATCH 8/8] Tweak names

---
 clang/docs/LanguageExtensions.rst            | 2 +-
 clang/include/clang/Basic/Builtins.td        | 2 +-
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp      | 2 +-
 clang/lib/CodeGen/CGBuiltin.cpp              | 6 +++---
 clang/lib/Sema/SemaChecking.cpp              | 2 +-
 clang/test/CodeGen/builtins-reduction-math.c | 4 ++--
 clang/test/Sema/builtins-reduction-math.c    | 6 +++---
 7 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 29eda1121dbcb..f1403faaa2860 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -946,7 +946,7 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``.
                                                 semantics, see `LangRef
                                                 <http://llvm.org/docs/LangRef.html#i-fminmax-family>`_
                                                 for the comparison.
- ET __builtin_reduce_any_order_fadd(VT a)       floating-point associative fadd reduction.                              floating point types
+ ET __builtin_reduce_assoc_fadd(VT a)           floating-point associative fadd reduction.                              floating point types
  ET __builtin_reduce_in_order_fadd(VT a, ET s)  in order floating-point fadd reduction, initializing the accumulator    floating point types
                                                 with `(ET)s`, then adding each lane of the `a` in-order, starting
                                                 from lane 0.
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 44c3d9d808f2b..0a2183e7fb1ea 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1671,7 +1671,7 @@ def ReduceInOrderFAdd : Builtin {
 }
 
 def ReduceAnyOrderFAdd : Builtin {
-  let Spellings = ["__builtin_reduce_any_order_fadd"];
+  let Spellings = ["__builtin_reduce_assoc_fadd"];
   let Attributes = [NoThrow, Const, CustomTypeChecking];
   let Prototype = "void(...)";
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 903b52eb58a56..9879b18d2b763 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1517,7 +1517,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
   case Builtin::BI__builtin_reduce_xor:
   case Builtin::BI__builtin_reduce_or:
   case Builtin::BI__builtin_reduce_and:
-  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_assoc_fadd:
   case Builtin::BI__builtin_reduce_in_order_fadd:
   case Builtin::BI__builtin_reduce_maximum:
   case Builtin::BI__builtin_reduce_minimum:
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d79b9e8e0dd90..38010cad75244 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4215,7 +4215,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_reduce_minimum:
     return RValue::get(emitBuiltinWithOneOverloadedType<1>(
         *this, E, Intrinsic::vector_reduce_fminimum, "rdx.minimum"));
-  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_assoc_fadd:
   case Builtin::BI__builtin_reduce_in_order_fadd: {
     llvm::Value *Vector = EmitScalarExpr(E->getArg(0));
     llvm::Type *ScalarTy = Vector->getType()->getScalarType();
@@ -4229,8 +4229,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Vector->getType());
     llvm::CallBase *Reduce = Builder.CreateCall(F, Args, "rdx.addf");
-    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_any_order_fadd) {
-      // `__builtin_reduce_any_order_fadd` is an associative reduction which
+    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_reduce_assoc_fadd) {
+      // `__builtin_reduce_assoc_fadd` is an associative reduction which
       // requires the reassoc FMF flag.
       llvm::FastMathFlags FMF;
       FMF.setAllowReassoc();
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 00a8dad741361..2e76fe37739f1 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3689,7 +3689,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     break;
   }
 
-  case Builtin::BI__builtin_reduce_any_order_fadd:
+  case Builtin::BI__builtin_reduce_assoc_fadd:
   case Builtin::BI__builtin_reduce_in_order_fadd: {
     // For in-order reductions require the user to specify the start value.
     bool InOrder = BuiltinID == Builtin::BI__builtin_reduce_in_order_fadd;
diff --git a/clang/test/CodeGen/builtins-reduction-math.c b/clang/test/CodeGen/builtins-reduction-math.c
index 764160e0fd48d..aacea50b97bb5 100644
--- a/clang/test/CodeGen/builtins-reduction-math.c
+++ b/clang/test/CodeGen/builtins-reduction-math.c
@@ -169,7 +169,7 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
 
   // CHECK:      [[V0:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call reassoc float @llvm.vector.reduce.fadd.v4f32(float 1.000000e+00, <4 x float> [[V0]])
-  float r1 = __builtin_reduce_any_order_fadd(vf1, 1.0f);
+  float r1 = __builtin_reduce_assoc_fadd(vf1, 1.0f);
 
   // CHECK:      [[V1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
   // CHECK-NEXT: call float @llvm.vector.reduce.fadd.v4f32(float 2.000000e+00, <4 x float> [[V1]])
@@ -177,7 +177,7 @@ void test_builtin_reduce_addf(float4 vf1, half8 vf2) {
 
   // CHECK:      [[V2:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V2:%.+]])
-  _Float16 r3 = __builtin_reduce_any_order_fadd(vf2);
+  _Float16 r3 = __builtin_reduce_assoc_fadd(vf2);
 
   // CHECK:      [[V3:%.+]] = load <8 x half>, ptr %vf2.addr, align 16
   // CHECK-NEXT: [[RDX:%.+]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[V3]])
diff --git a/clang/test/Sema/builtins-reduction-math.c b/clang/test/Sema/builtins-reduction-math.c
index 0d1cc8cd52b7b..5270de644356e 100644
--- a/clang/test/Sema/builtins-reduction-math.c
+++ b/clang/test/Sema/builtins-reduction-math.c
@@ -150,7 +150,7 @@ void test_builtin_reduce_minimum(int i, float4 v, int3 iv) {
 }
 
 void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
-  struct Foo s = __builtin_reduce_any_order_fadd(v);
+  struct Foo s = __builtin_reduce_assoc_fadd(v);
   // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
 
   f = __builtin_reduce_in_order_fadd(v);
@@ -159,10 +159,10 @@ void test_builtin_reduce_addf(float f, float4 v, int3 iv) {
   f = __builtin_reduce_in_order_fadd(v, f, f);
   // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
 
-  f = __builtin_reduce_any_order_fadd();
+  f = __builtin_reduce_assoc_fadd();
   // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
 
-  f = __builtin_reduce_any_order_fadd(iv);
+  f = __builtin_reduce_assoc_fadd(iv);
   // expected-error@-1 {{1st argument must be a vector of floating-point types (was 'int3' (vector of 3 'int' values))}}
 
   f = __builtin_reduce_in_order_fadd(v, (int)121);